[PATCH 1/2]AArch64 Add fallback case using sdot for usdot

public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed

From: Tamar Christina <tamar.christina@arm.com>
To: gcc-patches@gcc.gnu.org
Cc: nd@arm.com, Richard.Earnshaw@arm.com, Marcus.Shawcroft@arm.com,
	Kyrylo.Tkachov@arm.com, richard.sandiford@arm.com
Subject: [PATCH 1/2]AArch64  Add fallback case using sdot for usdot
Date: Thu, 16 Jun 2022 11:48:44 +0100	[thread overview]
Message-ID: <patch-15821-tamar@arm.com> (raw)

[-- Attachment #1: Type: text/plain, Size: 5934 bytes --]

Hi All,

The usdot operation is common in video encoders and decoders, including some of
the most widely used ones.

This patch adds a +dotprod version of the optab as a fallback for when you do
have sdot but not usdot available.

The fallback works by adding a bias to the unsigned argument to convert it to
a signed value and then correcting for the bias later on.

Essentially it relies on (x - 128)y + 128y == xy where x is unsigned and y is
signed (assuming both are 8-bit values).  Because a signed byte can only hold
values up to 127, the bias of 128 cannot be used directly as an sdot operand,
so we split the bias correction into:

   (x - 128)y + 127y + y

Concretely for:

#define N 480
#define SIGNEDNESS_1 unsigned
#define SIGNEDNESS_2 signed
#define SIGNEDNESS_3 signed
#define SIGNEDNESS_4 unsigned

/* Dot-product reduction used to demonstrate the fallback: accumulates the
   product of each a[i] (signed char) and b[i] (unsigned char) into RES over
   N elements and returns the total.  */
SIGNEDNESS_1 int __attribute__ ((noipa))
f (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict a,
   SIGNEDNESS_4 char *restrict b)
{
  for (__INTPTR_TYPE__ i = 0; i < N; ++i)
    {
      /* Widen both inputs to int before multiplying.  */
      int av = a[i];
      int bv = b[i];
      /* The product is narrowed to a signed short before being accumulated.  */
      SIGNEDNESS_2 short mult = av * bv;
      res += mult;
    }
  return res;
}

we generate:

        movi    v5.16b, 0x7f
        mov     x3, 0
        movi    v4.16b, 0x1
        movi    v3.16b, 0xffffffffffffff80
        movi    v0.4s, 0
.L2:
        ldr     q2, [x2, x3]
        ldr     q1, [x1, x3]
        add     x3, x3, 16
        sub     v2.16b, v2.16b, v3.16b
        sdot    v0.4s, v2.16b, v1.16b
        sdot    v0.4s, v5.16b, v1.16b
        sdot    v0.4s, v4.16b, v1.16b
        cmp     x3, 480
        bne     .L2

instead of:

        movi    v0.4s, 0
        mov     x3, 0
.L2:
        ldr     q2, [x1, x3]
        ldr     q1, [x2, x3]
        add     x3, x3, 16
        sxtl    v4.8h, v2.8b
        sxtl2   v3.8h, v2.16b
        uxtl    v2.8h, v1.8b
        uxtl2   v1.8h, v1.16b
        mul     v2.8h, v2.8h, v4.8h
        mul     v1.8h, v1.8h, v3.8h
        saddw   v0.4s, v0.4s, v2.4h
        saddw2  v0.4s, v0.4s, v2.8h
        saddw   v0.4s, v0.4s, v1.4h
        saddw2  v0.4s, v0.4s, v1.8h
        cmp     x3, 480
        bne     .L2

The new sequence is significantly faster as the operations it uses are well
optimized.  Note that execution tests are already in the mid-end testsuite.

Thanks to James Greenhalgh for the tip-off.

Bootstrapped and regression-tested on aarch64-none-linux-gnu with no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* config/aarch64/aarch64-simd.md (usdot_prod<vsi2qi>): Generate fallback
	or call original insn ...
	(usdot_prod<vsi2qi>_insn): ...here.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/simd/vusdot-autovec-2.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index cf2f4badacc594df9ecf06de3f8ea570ef9e0ff2..235a6fa371e471816284e3383e8564e9cf643a74 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -623,7 +623,7 @@ (define_insn "<sur>dot_prod<vsi2qi>"
 
 ;; These instructions map to the __builtins for the Armv8.6-a I8MM usdot
 ;; (vector) Dot Product operation and the vectorized optab.
-(define_insn "usdot_prod<vsi2qi>"
+(define_insn "usdot_prod<vsi2qi>_insn"
   [(set (match_operand:VS 0 "register_operand" "=w")
 	(plus:VS
 	  (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand" "w")
@@ -635,6 +635,43 @@ (define_insn "usdot_prod<vsi2qi>"
   [(set_attr "type" "neon_dot<q>")]
 )
 
+;; usdot auto-vec fallback code
+(define_expand "usdot_prod<vsi2qi>"
+  [(set (match_operand:VS 0 "register_operand")
+	(plus:VS
+	  (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand")
+		      (match_operand:<VSI2QI> 2 "register_operand")]
+	  UNSPEC_USDOT)
+	  (match_operand:VS 3 "register_operand")))]
+  "TARGET_DOTPROD || TARGET_I8MM"
+{
+  if (TARGET_I8MM)
+    {
+      emit_insn (gen_usdot_prod<vsi2qi>_insn (operands[0], operands[1],
+					      operands[2], operands[3]));
+      DONE;
+    }
+
+  machine_mode elemmode = GET_MODE_INNER (<VSI2QI>mode);
+  HOST_WIDE_INT val = 1 << (GET_MODE_BITSIZE (elemmode).to_constant () - 1);
+  rtx signbit = gen_int_mode (val, elemmode);
+  rtx t1 = gen_reg_rtx (<MODE>mode);
+  rtx t2 = gen_reg_rtx (<MODE>mode);
+  rtx tmp = gen_reg_rtx (<VSI2QI>mode);
+  rtx c1 = gen_const_vec_duplicate (<VSI2QI>mode,
+				    gen_int_mode (val - 1, elemmode));
+  rtx c2 = gen_const_vec_duplicate (<VSI2QI>mode, gen_int_mode (1, elemmode));
+  rtx dup = gen_const_vec_duplicate (<VSI2QI>mode, signbit);
+  c1 = force_reg (<VSI2QI>mode, c1);
+  c2 = force_reg (<VSI2QI>mode, c2);
+  dup = force_reg (<VSI2QI>mode, dup);
+  emit_insn (gen_sub<vsi2qi>3 (tmp, operands[1], dup));
+  emit_insn (gen_sdot_prod<vsi2qi> (t1, tmp, operands[2], operands[3]));
+  emit_insn (gen_sdot_prod<vsi2qi> (t2, c1, operands[2], t1));
+  emit_insn (gen_sdot_prod<vsi2qi> (operands[0], c2, operands[2], t2));
+  DONE;
+})
+
 ;; These instructions map to the __builtins for the Dot Product
 ;; indexed operations.
 (define_insn "aarch64_<sur>dot_lane<vsi2qi>"
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec-2.c b/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec-2.c
new file mode 100644
index 0000000000000000000000000000000000000000..acd8e36209690386d021df72f1467a696750ac3e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec-2.c
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=armv8.2-a+noi8mm+dotprod" } */
+
+#define N 480
+#define SIGNEDNESS_1 unsigned
+#define SIGNEDNESS_2 signed
+#define SIGNEDNESS_3 signed
+#define SIGNEDNESS_4 unsigned
+
+SIGNEDNESS_1 int __attribute__ ((noipa))
+f (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict a,
+   SIGNEDNESS_4 char *restrict b)
+{
+  for (__INTPTR_TYPE__ i = 0; i < N; ++i)
+    {
+      int av = a[i];
+      int bv = b[i];
+      SIGNEDNESS_2 short mult = av * bv;
+      res += mult;
+    }
+  return res;
+}
+
+/* { dg-final { scan-assembler-not {\tusdot\t} } } */
+/* { dg-final { scan-assembler-times {\tsdot\t} 3 } } */




-- 

[-- Attachment #2: rb15821.patch --]
[-- Type: text/plain, Size: 3337 bytes --]

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index cf2f4badacc594df9ecf06de3f8ea570ef9e0ff2..235a6fa371e471816284e3383e8564e9cf643a74 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -623,7 +623,7 @@ (define_insn "<sur>dot_prod<vsi2qi>"
 
 ;; These instructions map to the __builtins for the Armv8.6-a I8MM usdot
 ;; (vector) Dot Product operation and the vectorized optab.
-(define_insn "usdot_prod<vsi2qi>"
+(define_insn "usdot_prod<vsi2qi>_insn"
   [(set (match_operand:VS 0 "register_operand" "=w")
 	(plus:VS
 	  (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand" "w")
@@ -635,6 +635,43 @@ (define_insn "usdot_prod<vsi2qi>"
   [(set_attr "type" "neon_dot<q>")]
 )
 
+;; usdot auto-vec fallback code
+(define_expand "usdot_prod<vsi2qi>"
+  [(set (match_operand:VS 0 "register_operand")
+	(plus:VS
+	  (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand")
+		      (match_operand:<VSI2QI> 2 "register_operand")]
+	  UNSPEC_USDOT)
+	  (match_operand:VS 3 "register_operand")))]
+  "TARGET_DOTPROD || TARGET_I8MM"
+{
+  if (TARGET_I8MM)
+    {
+      emit_insn (gen_usdot_prod<vsi2qi>_insn (operands[0], operands[1],
+					      operands[2], operands[3]));
+      DONE;
+    }
+
+  machine_mode elemmode = GET_MODE_INNER (<VSI2QI>mode);
+  HOST_WIDE_INT val = 1 << (GET_MODE_BITSIZE (elemmode).to_constant () - 1);
+  rtx signbit = gen_int_mode (val, elemmode);
+  rtx t1 = gen_reg_rtx (<MODE>mode);
+  rtx t2 = gen_reg_rtx (<MODE>mode);
+  rtx tmp = gen_reg_rtx (<VSI2QI>mode);
+  rtx c1 = gen_const_vec_duplicate (<VSI2QI>mode,
+				    gen_int_mode (val - 1, elemmode));
+  rtx c2 = gen_const_vec_duplicate (<VSI2QI>mode, gen_int_mode (1, elemmode));
+  rtx dup = gen_const_vec_duplicate (<VSI2QI>mode, signbit);
+  c1 = force_reg (<VSI2QI>mode, c1);
+  c2 = force_reg (<VSI2QI>mode, c2);
+  dup = force_reg (<VSI2QI>mode, dup);
+  emit_insn (gen_sub<vsi2qi>3 (tmp, operands[1], dup));
+  emit_insn (gen_sdot_prod<vsi2qi> (t1, tmp, operands[2], operands[3]));
+  emit_insn (gen_sdot_prod<vsi2qi> (t2, c1, operands[2], t1));
+  emit_insn (gen_sdot_prod<vsi2qi> (operands[0], c2, operands[2], t2));
+  DONE;
+})
+
 ;; These instructions map to the __builtins for the Dot Product
 ;; indexed operations.
 (define_insn "aarch64_<sur>dot_lane<vsi2qi>"
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec-2.c b/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec-2.c
new file mode 100644
index 0000000000000000000000000000000000000000..acd8e36209690386d021df72f1467a696750ac3e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec-2.c
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=armv8.2-a+noi8mm+dotprod" } */
+
+#define N 480
+#define SIGNEDNESS_1 unsigned
+#define SIGNEDNESS_2 signed
+#define SIGNEDNESS_3 signed
+#define SIGNEDNESS_4 unsigned
+
+SIGNEDNESS_1 int __attribute__ ((noipa))
+f (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict a,
+   SIGNEDNESS_4 char *restrict b)
+{
+  for (__INTPTR_TYPE__ i = 0; i < N; ++i)
+    {
+      int av = a[i];
+      int bv = b[i];
+      SIGNEDNESS_2 short mult = av * bv;
+      res += mult;
+    }
+  return res;
+}
+
+/* { dg-final { scan-assembler-not {\tusdot\t} } } */
+/* { dg-final { scan-assembler-times {\tsdot\t} 3 } } */

next             reply	other threads:[~2022-06-16 10:49 UTC|newest]

Thread overview: 12+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-06-16 10:48 Tamar Christina [this message]
2022-06-16 10:49 ` [PATCH 2/2] Add SVE " Tamar Christina
2022-06-16 16:09 ` [PATCH 1/2]AArch64 Add " Richard Sandiford
2022-06-16 18:53   ` Richard Sandiford
2022-06-27  5:24     ` Tamar Christina
2022-06-27  6:09       ` Richard Biener
2022-06-28 15:54         ` Tamar Christina
2022-06-29  9:33           ` Richard Biener
2022-06-29 14:35             ` Richard Sandiford
2022-06-30  6:45               ` Richard Biener
2022-07-05  6:08                 ` Richard Sandiford
2022-07-05  7:41                   ` Richard Biener

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=patch-15821-tamar@arm.com \
    --to=tamar.christina@arm.com \
    --cc=Kyrylo.Tkachov@arm.com \
    --cc=Marcus.Shawcroft@arm.com \
    --cc=Richard.Earnshaw@arm.com \
    --cc=gcc-patches@gcc.gnu.org \
    --cc=nd@arm.com \
    --cc=richard.sandiford@arm.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).