public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
From: Tamar Christina <tamar.christina@arm.com>
To: gcc-patches@gcc.gnu.org
Cc: nd@arm.com, Richard.Earnshaw@arm.com, Marcus.Shawcroft@arm.com,
	Kyrylo.Tkachov@arm.com, richard.sandiford@arm.com
Subject: [PATCH 2/2] Add SVE fallback case using sdot for usdot
Date: Thu, 16 Jun 2022 11:49:17 +0100	[thread overview]
Message-ID: <YqsKrV7XiJf33bNU@arm.com> (raw)
In-Reply-To: <patch-15821-tamar@arm.com>

[-- Attachment #1: Type: text/plain, Size: 5873 bytes --]

Hi All,

The usdot operation is common in video encoders and decoders, including some of
the most widely used ones.

This patch adds a +dotprod version of the optab as a fallback for when sdot is
available but usdot is not.

The fallback works by adding a bias to the unsigned argument to convert it to
a signed value and then correcting for the bias later on.

Essentially it relies on (x - 128)y + 128y == xy where x is unsigned and y is
signed (assuming both are 8-bit values).  Because the range of a signed byte is
only to 127 we split the bias correction into:

   (x - 128)y + 127y + y

Concretely for:

#define N 480
#define SIGNEDNESS_1 unsigned
#define SIGNEDNESS_2 signed
#define SIGNEDNESS_3 signed
#define SIGNEDNESS_4 unsigned

SIGNEDNESS_1 int __attribute__ ((noipa))
f (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict a,
   SIGNEDNESS_4 char *restrict b)
{
  for (__INTPTR_TYPE__ i = 0; i < N; ++i)
    {
      int av = a[i];
      int bv = b[i];
      SIGNEDNESS_2 short mult = av * bv;
      res += mult;
    }
  return res;
}

we generate:

f:
	...
        mov     z6.b, #0
        mov     z5.b, #127
        mov     z4.b, #1
        mov     z3.b, #-128
        ptrue   p1.b, all
        movi    v0.4s, 0
.L2:
        ld1b    z2.b, p0/z, [x1, x3]
        ld1b    z1.b, p0/z, [x2, x3]
        incb    x3
        sel     z1.b, p0, z1.b, z6.b
        whilelo p0.b, w3, w4
        sub     z1.b, z1.b, z3.b
        sdot    z0.s, z1.b, z2.b
        sdot    z0.s, z5.b, z2.b
        sdot    z0.s, z4.b, z2.b
        b.any   .L2

instead of:

f:
        ...
.L2:
        ld1sb   z2.h, p0/z, [x1, x3]
        punpklo p1.h, p0.b
        ld1b    z0.h, p0/z, [x2, x3]
        add     x3, x3, x5
        mul     z0.h, p2/m, z0.h, z2.h
        sunpklo z2.s, z0.h
        sunpkhi z0.s, z0.h
        add     z1.s, p1/m, z1.s, z2.s
        punpkhi p1.h, p0.b
        whilelo p0.h, w3, w4
        add     z1.s, p1/m, z1.s, z0.s
        b.any   .L2

The new sequence is significantly faster as the operations it uses are well
optimized.  Note that execution tests are already in the mid-end testsuite.

Thanks to James Greenhalgh for the tip-off.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar


gcc/ChangeLog:

	* config/aarch64/aarch64-sve.md (@<sur>dot_prod<vsi2qi>): Generate
	fallback or call original insn ...
	(@<sur>dot_prod<vsi2qi>_insn): ...here.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/sve/vusdot-autovec_2.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index bd60e65b0c3f05f1c931f03807170f3b9d699de5..ca60416e7d7b1d8848f4ec5a624ae479a12ae5bc 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -6887,7 +6887,7 @@ (define_insn "@aarch64_<sur>dot_prod_lane<vsi2qi>"
   [(set_attr "movprfx" "*,yes")]
 )
 
-(define_insn "@<sur>dot_prod<vsi2qi>"
+(define_insn "@<sur>dot_prod<vsi2qi>_insn"
   [(set (match_operand:VNx4SI_ONLY 0 "register_operand" "=w, ?&w")
         (plus:VNx4SI_ONLY
 	  (unspec:VNx4SI_ONLY
@@ -6902,6 +6902,43 @@ (define_insn "@<sur>dot_prod<vsi2qi>"
    [(set_attr "movprfx" "*,yes")]
 )
 
+(define_expand "@<sur>dot_prod<vsi2qi>"
+  [(set (match_operand:VNx4SI_ONLY 0 "register_operand")
+        (plus:VNx4SI_ONLY
+	  (unspec:VNx4SI_ONLY
+	    [(match_operand:<VSI2QI> 1 "register_operand")
+	     (match_operand:<VSI2QI> 2 "register_operand")]
+	    DOTPROD_US_ONLY)
+	  (match_operand:VNx4SI_ONLY 3 "register_operand")))]
+  "TARGET_SVE || TARGET_SVE_I8MM"
+{
+  if (TARGET_SVE_I8MM)
+    {
+      emit_insn (gen_usdot_prod<vsi2qi>_insn (operands[0], operands[1],
+                                              operands[2], operands[3]));
+      DONE;
+    }
+
+  machine_mode elemmode = GET_MODE_INNER (<VSI2QI>mode);
+  HOST_WIDE_INT val = 1 << (GET_MODE_BITSIZE (elemmode).to_constant () - 1);
+  rtx signbit = gen_int_mode (val, elemmode);
+  rtx t1 = gen_reg_rtx (<MODE>mode);
+  rtx t2 = gen_reg_rtx (<MODE>mode);
+  rtx tmp = gen_reg_rtx (<VSI2QI>mode);
+  rtx c1 = gen_const_vec_duplicate (<VSI2QI>mode,
+                                    gen_int_mode (val - 1, elemmode));
+  rtx c2 = gen_const_vec_duplicate (<VSI2QI>mode, gen_int_mode (1, elemmode));
+  rtx dup = gen_const_vec_duplicate (<VSI2QI>mode, signbit);
+  c1 = force_reg (<VSI2QI>mode, c1);
+  c2 = force_reg (<VSI2QI>mode, c2);
+  dup = force_reg (<VSI2QI>mode, dup);
+  emit_insn (gen_sub<vsi2qi>3 (tmp, operands[1], dup));
+  emit_insn (gen_sdot_prod<vsi2qi> (t1, tmp, operands[2], operands[3]));
+  emit_insn (gen_sdot_prod<vsi2qi> (t2, c1, operands[2], t1));
+  emit_insn (gen_sdot_prod<vsi2qi> (operands[0], c2, operands[2], t2));
+  DONE;
+})
+
 (define_insn "@aarch64_<sur>dot_prod_lane<vsi2qi>"
   [(set (match_operand:VNx4SI_ONLY 0 "register_operand" "=w, ?&w")
 	(plus:VNx4SI_ONLY
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vusdot-autovec_2.c b/gcc/testsuite/gcc.target/aarch64/sve/vusdot-autovec_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..cbe6b7eb7bef5a5c4b8e5ac823ebdf1d309f8490
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/vusdot-autovec_2.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+#pragma GCC target "+noi8mm"
+
+#define N 480
+#define SIGNEDNESS_1 unsigned
+#define SIGNEDNESS_2 signed
+#define SIGNEDNESS_3 signed
+#define SIGNEDNESS_4 unsigned
+
+SIGNEDNESS_1 int __attribute__ ((noipa))
+f (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict a,
+   SIGNEDNESS_4 char *restrict b)
+{
+  for (__INTPTR_TYPE__ i = 0; i < N; ++i)
+    {
+      int av = a[i];
+      int bv = b[i];
+      SIGNEDNESS_2 short mult = av * bv;
+      res += mult;
+    }
+  return res;
+}
+
+/* { dg-final { scan-assembler-not {\tusdot\t} } } */
+/* { dg-final { scan-assembler-times {\tsdot\t} 3 } } */




-- 

[-- Attachment #2: rb15822.patch --]
[-- Type: text/plain, Size: 3343 bytes --]

diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index bd60e65b0c3f05f1c931f03807170f3b9d699de5..ca60416e7d7b1d8848f4ec5a624ae479a12ae5bc 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -6887,7 +6887,7 @@ (define_insn "@aarch64_<sur>dot_prod_lane<vsi2qi>"
   [(set_attr "movprfx" "*,yes")]
 )
 
-(define_insn "@<sur>dot_prod<vsi2qi>"
+(define_insn "@<sur>dot_prod<vsi2qi>_insn"
   [(set (match_operand:VNx4SI_ONLY 0 "register_operand" "=w, ?&w")
         (plus:VNx4SI_ONLY
 	  (unspec:VNx4SI_ONLY
@@ -6902,6 +6902,43 @@ (define_insn "@<sur>dot_prod<vsi2qi>"
    [(set_attr "movprfx" "*,yes")]
 )
 
+(define_expand "@<sur>dot_prod<vsi2qi>"
+  [(set (match_operand:VNx4SI_ONLY 0 "register_operand")
+        (plus:VNx4SI_ONLY
+	  (unspec:VNx4SI_ONLY
+	    [(match_operand:<VSI2QI> 1 "register_operand")
+	     (match_operand:<VSI2QI> 2 "register_operand")]
+	    DOTPROD_US_ONLY)
+	  (match_operand:VNx4SI_ONLY 3 "register_operand")))]
+  "TARGET_SVE || TARGET_SVE_I8MM"
+{
+  if (TARGET_SVE_I8MM)
+    {
+      emit_insn (gen_usdot_prod<vsi2qi>_insn (operands[0], operands[1],
+                                              operands[2], operands[3]));
+      DONE;
+    }
+
+  machine_mode elemmode = GET_MODE_INNER (<VSI2QI>mode);
+  HOST_WIDE_INT val = 1 << (GET_MODE_BITSIZE (elemmode).to_constant () - 1);
+  rtx signbit = gen_int_mode (val, elemmode);
+  rtx t1 = gen_reg_rtx (<MODE>mode);
+  rtx t2 = gen_reg_rtx (<MODE>mode);
+  rtx tmp = gen_reg_rtx (<VSI2QI>mode);
+  rtx c1 = gen_const_vec_duplicate (<VSI2QI>mode,
+                                    gen_int_mode (val - 1, elemmode));
+  rtx c2 = gen_const_vec_duplicate (<VSI2QI>mode, gen_int_mode (1, elemmode));
+  rtx dup = gen_const_vec_duplicate (<VSI2QI>mode, signbit);
+  c1 = force_reg (<VSI2QI>mode, c1);
+  c2 = force_reg (<VSI2QI>mode, c2);
+  dup = force_reg (<VSI2QI>mode, dup);
+  emit_insn (gen_sub<vsi2qi>3 (tmp, operands[1], dup));
+  emit_insn (gen_sdot_prod<vsi2qi> (t1, tmp, operands[2], operands[3]));
+  emit_insn (gen_sdot_prod<vsi2qi> (t2, c1, operands[2], t1));
+  emit_insn (gen_sdot_prod<vsi2qi> (operands[0], c2, operands[2], t2));
+  DONE;
+})
+
 (define_insn "@aarch64_<sur>dot_prod_lane<vsi2qi>"
   [(set (match_operand:VNx4SI_ONLY 0 "register_operand" "=w, ?&w")
 	(plus:VNx4SI_ONLY
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vusdot-autovec_2.c b/gcc/testsuite/gcc.target/aarch64/sve/vusdot-autovec_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..cbe6b7eb7bef5a5c4b8e5ac823ebdf1d309f8490
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/vusdot-autovec_2.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+#pragma GCC target "+noi8mm"
+
+#define N 480
+#define SIGNEDNESS_1 unsigned
+#define SIGNEDNESS_2 signed
+#define SIGNEDNESS_3 signed
+#define SIGNEDNESS_4 unsigned
+
+SIGNEDNESS_1 int __attribute__ ((noipa))
+f (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict a,
+   SIGNEDNESS_4 char *restrict b)
+{
+  for (__INTPTR_TYPE__ i = 0; i < N; ++i)
+    {
+      int av = a[i];
+      int bv = b[i];
+      SIGNEDNESS_2 short mult = av * bv;
+      res += mult;
+    }
+  return res;
+}
+
+/* { dg-final { scan-assembler-not {\tusdot\t} } } */
+/* { dg-final { scan-assembler-times {\tsdot\t} 3 } } */




  reply	other threads:[~2022-06-16 10:50 UTC|newest]

Thread overview: 12+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-06-16 10:48 [PATCH 1/2]AArch64 Add " Tamar Christina
2022-06-16 10:49 ` Tamar Christina [this message]
2022-06-16 16:09 ` Richard Sandiford
2022-06-16 18:53   ` Richard Sandiford
2022-06-27  5:24     ` Tamar Christina
2022-06-27  6:09       ` Richard Biener
2022-06-28 15:54         ` Tamar Christina
2022-06-29  9:33           ` Richard Biener
2022-06-29 14:35             ` Richard Sandiford
2022-06-30  6:45               ` Richard Biener
2022-07-05  6:08                 ` Richard Sandiford
2022-07-05  7:41                   ` Richard Biener

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=YqsKrV7XiJf33bNU@arm.com \
    --to=tamar.christina@arm.com \
    --cc=Kyrylo.Tkachov@arm.com \
    --cc=Marcus.Shawcroft@arm.com \
    --cc=Richard.Earnshaw@arm.com \
    --cc=gcc-patches@gcc.gnu.org \
    --cc=nd@arm.com \
    --cc=richard.sandiford@arm.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).