[PATCH 3/4]AArch64 Add SVE2 implementation for pow2 bitmask division

public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed

From: Tamar Christina <tamar.christina@arm.com>
To: gcc-patches@gcc.gnu.org
Cc: nd@arm.com, Richard.Earnshaw@arm.com, Marcus.Shawcroft@arm.com,
	Kyrylo.Tkachov@arm.com, richard.sandiford@arm.com
Subject: [PATCH 3/4]AArch64 Add SVE2 implementation for pow2 bitmask division
Date: Fri, 23 Sep 2022 10:33:46 +0100	[thread overview]
Message-ID: <Yy19es5TOyWlHsnk@arm.com> (raw)
In-Reply-To: <patch-15779-tamar@arm.com>

[-- Attachment #1: Type: text/plain, Size: 5630 bytes --]

Hi All,

In plenty of image and video processing code it's common to modify pixel values
by a widening operation and then scale them back into range by dividing by 255.

This patch adds an named function to allow us to emit an optimized sequence
when doing an unsigned division that is equivalent to:

   x = y / (2 ^ (bitsize (y)/2)-1)

For SVE2 this means we generate for:

void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n)
{
  for (int i = 0; i < (n & -16); i+=1)
    pixel[i] = (pixel[i] * level) / 0xff;
}

the following:

        mov     z3.b, #1
.L3:
        ld1b    z0.h, p0/z, [x0, x3]
        mul     z0.h, p1/m, z0.h, z2.h
        addhnb  z1.b, z0.h, z3.h
        addhnb  z0.b, z0.h, z1.h
        st1b    z0.h, p0, [x0, x3]
        inch    x3
        whilelo p0.h, w3, w2
        b.any   .L3

instead of:

.L3:
        ld1b    z0.h, p1/z, [x0, x3]
        mul     z0.h, p0/m, z0.h, z1.h
        umulh   z0.h, p0/m, z0.h, z2.h
        lsr     z0.h, z0.h, #7
        st1b    z0.h, p1, [x0, x3]
        inch    x3
        whilelo p1.h, w3, w2
        b.any   .L3

Which results in significantly faster code.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* config/aarch64/aarch64-sve2.md (@aarch64_bitmask_udiv<mode>3): New.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/sve2/div-by-bitmask_1.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md
index f138f4be4bcf74c1a4a6d5847ed831435246737f..4d097f7c405cc68a1d6cda5c234a1023a6eba0d1 100644
--- a/gcc/config/aarch64/aarch64-sve2.md
+++ b/gcc/config/aarch64/aarch64-sve2.md
@@ -71,6 +71,7 @@
 ;; ---- [INT] Reciprocal approximation
 ;; ---- [INT<-FP] Base-2 logarithm
 ;; ---- [INT] Polynomial multiplication
+;; ---- [INT] Misc optab implementations
 ;;
 ;; == Permutation
 ;; ---- [INT,FP] General permutes
@@ -2312,6 +2313,47 @@ (define_insn "@aarch64_sve_<optab><mode>"
   "<sve_int_op>\t%0.<Vewtype>, %1.<Vetype>, %2.<Vetype>"
 )
 
+;; -------------------------------------------------------------------------
+;; ---- [INT] Misc optab implementations
+;; -------------------------------------------------------------------------
+;; Includes:
+;; - aarch64_bitmask_udiv
+;; -------------------------------------------------------------------------
+
+;; div optimizations using narrowings
+;; we can do the division e.g. shorts by 255 faster by calculating it as
+;; (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in
+;; double the precision of x.
+;;
+;; See aarch64-simd.md for bigger explanation.
+(define_expand "@aarch64_bitmask_udiv<mode>3"
+  [(match_operand:SVE_FULL_HSDI 0 "register_operand")
+   (match_operand:SVE_FULL_HSDI 1 "register_operand")
+   (match_operand:SVE_FULL_HSDI 2 "immediate_operand")]
+  "TARGET_SVE2"
+{
+  unsigned HOST_WIDE_INT size
+    = (1ULL << GET_MODE_UNIT_BITSIZE (<VNARROW>mode)) - 1;
+  if (!CONST_VECTOR_P (operands[2])
+      || const_vector_encoded_nelts (operands[2]) != 1
+      || size != UINTVAL (CONST_VECTOR_ELT (operands[2], 0)))
+    FAIL;
+
+  rtx addend = gen_reg_rtx (<MODE>mode);
+  rtx tmp1 = gen_reg_rtx (<VNARROW>mode);
+  rtx tmp2 = gen_reg_rtx (<VNARROW>mode);
+  rtx val = aarch64_simd_gen_const_vector_dup (<VNARROW>mode, 1);
+  emit_move_insn (addend, lowpart_subreg (<MODE>mode, val, <VNARROW>mode));
+  emit_insn (gen_aarch64_sve (UNSPEC_ADDHNB, <MODE>mode, tmp1, operands[1],
+			      addend));
+  emit_insn (gen_aarch64_sve (UNSPEC_ADDHNB, <MODE>mode, tmp2, operands[1],
+			      lowpart_subreg (<MODE>mode, tmp1,
+					      <VNARROW>mode)));
+  emit_move_insn (operands[0],
+		  lowpart_subreg (<MODE>mode, tmp2, <VNARROW>mode));
+  DONE;
+})
+
 ;; =========================================================================
 ;; == Permutation
 ;; =========================================================================
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_1.c b/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..e6f5098c30f4e2eb8ed1af153c0bb0d204cda6d9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_1.c
@@ -0,0 +1,53 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O2 -std=c99" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <stdint.h>
+
+/*
+** draw_bitmap1:
+** ...
+**	mul	z[0-9]+.h, p[0-9]+/m, z[0-9]+.h, z[0-9]+.h
+**	addhnb	z[0-9]+.b, z[0-9]+.h, z[0-9]+.h
+**	addhnb	z[0-9]+.b, z[0-9]+.h, z[0-9]+.h
+** ...
+*/
+void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * level) / 0xff;
+}
+
+void draw_bitmap2(uint8_t* restrict pixel, uint8_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * level) / 0xfe;
+}
+
+/*
+** draw_bitmap3:
+** ...
+**	mul	z[0-9]+.s, p[0-9]+/m, z[0-9]+.s, z[0-9]+.s
+**	addhnb	z[0-9]+.h, z[0-9]+.s, z[0-9]+.s
+**	addhnb	z[0-9]+.h, z[0-9]+.s, z[0-9]+.s
+** ...
+*/
+void draw_bitmap3(uint16_t* restrict pixel, uint16_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * level) / 0xffffU;
+}
+
+/*
+** draw_bitmap4:
+** ...
+**	mul	z[0-9]+.d, p[0-9]+/m, z[0-9]+.d, z[0-9]+.d
+**	addhnb	z[0-9]+.s, z[0-9]+.d, z[0-9]+.d
+**	addhnb	z[0-9]+.s, z[0-9]+.d, z[0-9]+.d
+** ...
+*/
+void draw_bitmap4(uint32_t* restrict pixel, uint32_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}




-- 

[-- Attachment #2: rb15813.patch --]
[-- Type: text/plain, Size: 4196 bytes --]

diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md
index f138f4be4bcf74c1a4a6d5847ed831435246737f..4d097f7c405cc68a1d6cda5c234a1023a6eba0d1 100644
--- a/gcc/config/aarch64/aarch64-sve2.md
+++ b/gcc/config/aarch64/aarch64-sve2.md
@@ -71,6 +71,7 @@
 ;; ---- [INT] Reciprocal approximation
 ;; ---- [INT<-FP] Base-2 logarithm
 ;; ---- [INT] Polynomial multiplication
+;; ---- [INT] Misc optab implementations
 ;;
 ;; == Permutation
 ;; ---- [INT,FP] General permutes
@@ -2312,6 +2313,47 @@ (define_insn "@aarch64_sve_<optab><mode>"
   "<sve_int_op>\t%0.<Vewtype>, %1.<Vetype>, %2.<Vetype>"
 )
 
+;; -------------------------------------------------------------------------
+;; ---- [INT] Misc optab implementations
+;; -------------------------------------------------------------------------
+;; Includes:
+;; - aarch64_bitmask_udiv
+;; -------------------------------------------------------------------------
+
+;; div optimizations using narrowings
+;; we can do the division e.g. shorts by 255 faster by calculating it as
+;; (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in
+;; double the precision of x.
+;;
+;; See aarch64-simd.md for bigger explanation.
+(define_expand "@aarch64_bitmask_udiv<mode>3"
+  [(match_operand:SVE_FULL_HSDI 0 "register_operand")
+   (match_operand:SVE_FULL_HSDI 1 "register_operand")
+   (match_operand:SVE_FULL_HSDI 2 "immediate_operand")]
+  "TARGET_SVE2"
+{
+  unsigned HOST_WIDE_INT size
+    = (1ULL << GET_MODE_UNIT_BITSIZE (<VNARROW>mode)) - 1;
+  if (!CONST_VECTOR_P (operands[2])
+      || const_vector_encoded_nelts (operands[2]) != 1
+      || size != UINTVAL (CONST_VECTOR_ELT (operands[2], 0)))
+    FAIL;
+
+  rtx addend = gen_reg_rtx (<MODE>mode);
+  rtx tmp1 = gen_reg_rtx (<VNARROW>mode);
+  rtx tmp2 = gen_reg_rtx (<VNARROW>mode);
+  rtx val = aarch64_simd_gen_const_vector_dup (<VNARROW>mode, 1);
+  emit_move_insn (addend, lowpart_subreg (<MODE>mode, val, <VNARROW>mode));
+  emit_insn (gen_aarch64_sve (UNSPEC_ADDHNB, <MODE>mode, tmp1, operands[1],
+			      addend));
+  emit_insn (gen_aarch64_sve (UNSPEC_ADDHNB, <MODE>mode, tmp2, operands[1],
+			      lowpart_subreg (<MODE>mode, tmp1,
+					      <VNARROW>mode)));
+  emit_move_insn (operands[0],
+		  lowpart_subreg (<MODE>mode, tmp2, <VNARROW>mode));
+  DONE;
+})
+
 ;; =========================================================================
 ;; == Permutation
 ;; =========================================================================
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_1.c b/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..e6f5098c30f4e2eb8ed1af153c0bb0d204cda6d9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_1.c
@@ -0,0 +1,53 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O2 -std=c99" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <stdint.h>
+
+/*
+** draw_bitmap1:
+** ...
+**	mul	z[0-9]+.h, p[0-9]+/m, z[0-9]+.h, z[0-9]+.h
+**	addhnb	z[0-9]+.b, z[0-9]+.h, z[0-9]+.h
+**	addhnb	z[0-9]+.b, z[0-9]+.h, z[0-9]+.h
+** ...
+*/
+void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * level) / 0xff;
+}
+
+void draw_bitmap2(uint8_t* restrict pixel, uint8_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * level) / 0xfe;
+}
+
+/*
+** draw_bitmap3:
+** ...
+**	mul	z[0-9]+.s, p[0-9]+/m, z[0-9]+.s, z[0-9]+.s
+**	addhnb	z[0-9]+.h, z[0-9]+.s, z[0-9]+.s
+**	addhnb	z[0-9]+.h, z[0-9]+.s, z[0-9]+.s
+** ...
+*/
+void draw_bitmap3(uint16_t* restrict pixel, uint16_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * level) / 0xffffU;
+}
+
+/*
+** draw_bitmap4:
+** ...
+**	mul	z[0-9]+.d, p[0-9]+/m, z[0-9]+.d, z[0-9]+.d
+**	addhnb	z[0-9]+.s, z[0-9]+.d, z[0-9]+.d
+**	addhnb	z[0-9]+.s, z[0-9]+.d, z[0-9]+.d
+** ...
+*/
+void draw_bitmap4(uint32_t* restrict pixel, uint32_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}

next prev parent reply	other threads:[~2022-09-23  9:34 UTC|newest]

Thread overview: 35+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-06-09  4:39 [PATCH 1/2]middle-end Support optimized division by pow2 bitmask Tamar Christina
2022-06-09  4:40 ` [PATCH 2/2]AArch64 aarch64: Add implementation for pow2 bitmask division Tamar Christina
2022-06-13  9:24 ` [PATCH 1/2]middle-end Support optimized division by pow2 bitmask Richard Biener
2022-06-13  9:39   ` Richard Biener
2022-06-13 10:09     ` Tamar Christina
2022-06-13 11:47       ` Richard Biener
2022-06-13 14:37         ` Tamar Christina
2022-06-14 13:18           ` Richard Biener
2022-06-14 13:38             ` Tamar Christina
2022-06-14 13:42             ` Richard Sandiford
2022-06-14 15:57               ` Tamar Christina
2022-06-14 16:09                 ` Richard Biener
2022-06-22  0:34                 ` Tamar Christina
2022-06-26 19:55                   ` Jeff Law
2022-09-23  9:33 ` [PATCH 1/4]middle-end Support not decomposing specific divisions during vectorization Tamar Christina
2022-09-23  9:33 ` [PATCH 2/4]AArch64 Add implementation for pow2 bitmask division Tamar Christina
2022-10-31 11:34   ` Tamar Christina
2022-11-09  8:33     ` Tamar Christina
2022-11-09 16:02     ` Kyrylo Tkachov
2022-09-23  9:33 ` Tamar Christina [this message]
2022-10-31 11:34   ` [PATCH 3/4]AArch64 Add SVE2 " Tamar Christina
2022-11-09  8:33     ` Tamar Christina
2022-11-12 12:17   ` Richard Sandiford
2022-09-23  9:34 ` [PATCH 4/4]AArch64 sve2: rewrite pack + NARROWB + NARROWB to NARROWB + NARROWT Tamar Christina
2022-10-31 11:34   ` Tamar Christina
2022-11-09  8:33     ` Tamar Christina
2022-11-12 12:25   ` Richard Sandiford
2022-11-12 12:33     ` Richard Sandiford
2022-09-26 10:39 ` [PATCH 1/4]middle-end Support not decomposing specific divisions during vectorization Richard Biener
2022-10-31 11:34   ` Tamar Christina
2022-10-31 17:12     ` Jeff Law
2022-11-08 17:36     ` Tamar Christina
2022-11-09  8:01       ` Richard Biener
2022-11-09  8:26         ` Tamar Christina
2022-11-09 10:37 ` Kyrylo Tkachov

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=Yy19es5TOyWlHsnk@arm.com \
    --to=tamar.christina@arm.com \
    --cc=Kyrylo.Tkachov@arm.com \
    --cc=Marcus.Shawcroft@arm.com \
    --cc=Richard.Earnshaw@arm.com \
    --cc=gcc-patches@gcc.gnu.org \
    --cc=nd@arm.com \
    --cc=richard.sandiford@arm.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).