From: Tamar Christina <tamar.christina@arm.com>
To: gcc-patches@gcc.gnu.org
Cc: nd@arm.com, Richard.Earnshaw@arm.com, Marcus.Shawcroft@arm.com,
Kyrylo.Tkachov@arm.com, richard.sandiford@arm.com
Subject: [PATCH 1/2]AArch64 Add fallback case using sdot for usdot
Date: Thu, 16 Jun 2022 11:48:44 +0100 [thread overview]
Message-ID: <patch-15821-tamar@arm.com> (raw)
[-- Attachment #1: Type: text/plain, Size: 5934 bytes --]
Hi All,
The usdot operation is common in video encoders and decoders, including some of
the most widely used ones.
This patch adds a +dotprod version of the optab as a fallback for when you do
have sdot but not usdot available.
The fallback works by adding a bias to the unsigned argument to convert it to
a signed value and then correcting for the bias later on.
Essentially it relies on (x - 128)y + 128y == xy where x is unsigned and y is
signed (assuming both are 8-bit values). Because 128 is not representable in a
signed byte (the maximum is 127), we split the bias correction 128y into 127y + y:
(x - 128)y + 127y + y
Concretely for:
#define N 480
#define SIGNEDNESS_1 unsigned
#define SIGNEDNESS_2 signed
#define SIGNEDNESS_3 signed
#define SIGNEDNESS_4 unsigned
/* Example kernel: for each of the N elements, multiply the signed char a[i]
   by the unsigned char b[i], convert the product to a signed short and add
   it to the unsigned accumulator res.  Returns the final value of res.  */
SIGNEDNESS_1 int __attribute__ ((noipa))
f (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict a,
SIGNEDNESS_4 char *restrict b)
{
for (__INTPTR_TYPE__ i = 0; i < N; ++i)
{
int av = a[i];
int bv = b[i];
SIGNEDNESS_2 short mult = av * bv;
res += mult;
}
return res;
}
we generate:
movi v5.16b, 0x7f
mov x3, 0
movi v4.16b, 0x1
movi v3.16b, 0xffffffffffffff80
movi v0.4s, 0
.L2:
ldr q2, [x2, x3]
ldr q1, [x1, x3]
add x3, x3, 16
sub v2.16b, v2.16b, v3.16b
sdot v0.4s, v2.16b, v1.16b
sdot v0.4s, v5.16b, v1.16b
sdot v0.4s, v4.16b, v1.16b
cmp x3, 480
bne .L2
instead of:
movi v0.4s, 0
mov x3, 0
.L2:
ldr q2, [x1, x3]
ldr q1, [x2, x3]
add x3, x3, 16
sxtl v4.8h, v2.8b
sxtl2 v3.8h, v2.16b
uxtl v2.8h, v1.8b
uxtl2 v1.8h, v1.16b
mul v2.8h, v2.8h, v4.8h
mul v1.8h, v1.8h, v3.8h
saddw v0.4s, v0.4s, v2.4h
saddw2 v0.4s, v0.4s, v2.8h
saddw v0.4s, v0.4s, v1.4h
saddw2 v0.4s, v0.4s, v1.8h
cmp x3, 480
bne .L2
The new sequence is significantly faster as the operations it uses are well
optimized. Note that execution tests are already in the mid-end testsuite.
Thanks to James Greenhalgh for the tip-off.
Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.
Ok for master?
Thanks,
Tamar
gcc/ChangeLog:
* config/aarch64/aarch64-simd.md (usdot_prod<vsi2qi>): Generate fallback
or call original insns ...
(usdot_prod<vsi2qi>_insn): ...here.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/simd/vusdot-autovec-2.c: New test.
--- inline copy of patch --
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index cf2f4badacc594df9ecf06de3f8ea570ef9e0ff2..235a6fa371e471816284e3383e8564e9cf643a74 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -623,7 +623,7 @@ (define_insn "<sur>dot_prod<vsi2qi>"
;; These instructions map to the __builtins for the Armv8.6-a I8MM usdot
;; (vector) Dot Product operation and the vectorized optab.
-(define_insn "usdot_prod<vsi2qi>"
+(define_insn "usdot_prod<vsi2qi>_insn"
[(set (match_operand:VS 0 "register_operand" "=w")
(plus:VS
(unspec:VS [(match_operand:<VSI2QI> 1 "register_operand" "w")
@@ -635,6 +635,43 @@ (define_insn "usdot_prod<vsi2qi>"
[(set_attr "type" "neon_dot<q>")]
)
+;; usdot auto-vec fallback code
+;; Expander for the usdot_prod optab.  When TARGET_I8MM is enabled it emits
+;; usdot_prod<vsi2qi>_insn directly.  Otherwise (TARGET_DOTPROD only) it
+;; synthesizes the operation from a vector subtract and three sdot_prod
+;; expansions: operand 1 has the element sign bit subtracted from it, and the
+;; bias is corrected by further accumulating (signbit - 1) * operand 2 and
+;; 1 * operand 2.  For 8-bit elements this is (x - 128)y + 127y + y == xy.
+(define_expand "usdot_prod<vsi2qi>"
+ [(set (match_operand:VS 0 "register_operand")
+ (plus:VS
+ (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand")
+ (match_operand:<VSI2QI> 2 "register_operand")]
+ UNSPEC_USDOT)
+ (match_operand:VS 3 "register_operand")))]
+ "TARGET_DOTPROD || TARGET_I8MM"
+{
+ /* Use the dedicated usdot pattern whenever I8MM is enabled.  */
+ if (TARGET_I8MM)
+ {
+ emit_insn (gen_usdot_prod<vsi2qi>_insn (operands[0], operands[1],
+ operands[2], operands[3]));
+ DONE;
+ }
+
+ /* Fallback using only sdot_prod.  VAL is the sign bit of one input
+ element, i.e. 1 << (element bits - 1).
+ NOTE(review): the shift below is evaluated in int before being widened;
+ HOST_WIDE_INT_1 << ... would match the declared type -- confirm whether
+ that matters for the element modes this expander is instantiated for.  */
+ machine_mode elemmode = GET_MODE_INNER (<VSI2QI>mode);
+ HOST_WIDE_INT val = 1 << (GET_MODE_BITSIZE (elemmode).to_constant () - 1);
+ rtx signbit = gen_int_mode (val, elemmode);
+ /* T1 and T2 hold the intermediate accumulators; TMP holds the biased
+ copy of operand 1.  */
+ rtx t1 = gen_reg_rtx (<MODE>mode);
+ rtx t2 = gen_reg_rtx (<MODE>mode);
+ rtx tmp = gen_reg_rtx (<VSI2QI>mode);
+ /* Constant vectors: C1 = {VAL - 1, ...}, C2 = {1, ...} and
+ DUP = {sign bit, ...}.  C1 and C2 together supply the VAL * operand 2
+ bias correction.  */
+ rtx c1 = gen_const_vec_duplicate (<VSI2QI>mode,
+ gen_int_mode (val - 1, elemmode));
+ rtx c2 = gen_const_vec_duplicate (<VSI2QI>mode, gen_int_mode (1, elemmode));
+ rtx dup = gen_const_vec_duplicate (<VSI2QI>mode, signbit);
+ c1 = force_reg (<VSI2QI>mode, c1);
+ c2 = force_reg (<VSI2QI>mode, c2);
+ dup = force_reg (<VSI2QI>mode, dup);
+ /* tmp = operand 1 - DUP, then chain the three signed dot products through
+ T1 and T2 so the final accumulation lands in operand 0.  */
+ emit_insn (gen_sub<vsi2qi>3 (tmp, operands[1], dup));
+ emit_insn (gen_sdot_prod<vsi2qi> (t1, tmp, operands[2], operands[3]));
+ emit_insn (gen_sdot_prod<vsi2qi> (t2, c1, operands[2], t1));
+ emit_insn (gen_sdot_prod<vsi2qi> (operands[0], c2, operands[2], t2));
+ DONE;
+})
+
;; These instructions map to the __builtins for the Dot Product
;; indexed operations.
(define_insn "aarch64_<sur>dot_lane<vsi2qi>"
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec-2.c b/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec-2.c
new file mode 100644
index 0000000000000000000000000000000000000000..acd8e36209690386d021df72f1467a696750ac3e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec-2.c
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=armv8.2-a+noi8mm+dotprod" } */
+
+#define N 480
+#define SIGNEDNESS_1 unsigned
+#define SIGNEDNESS_2 signed
+#define SIGNEDNESS_3 signed
+#define SIGNEDNESS_4 unsigned
+
+/* For each of the N elements, multiply the signed char a[i] by the unsigned
+   char b[i], convert the product to a signed short and add it to the
+   unsigned accumulator res.  Returns the final value of res.  */
+SIGNEDNESS_1 int __attribute__ ((noipa))
+f (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict a,
+ SIGNEDNESS_4 char *restrict b)
+{
+ for (__INTPTR_TYPE__ i = 0; i < N; ++i)
+ {
+ int av = a[i];
+ int bv = b[i];
+ SIGNEDNESS_2 short mult = av * bv;
+ res += mult;
+ }
+ return res;
+}
+
+/* { dg-final { scan-assembler-not {\tusdot\t} } } */
+/* { dg-final { scan-assembler-times {\tsdot\t} 3 } } */
--
[-- Attachment #2: rb15821.patch --]
[-- Type: text/plain, Size: 3337 bytes --]
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index cf2f4badacc594df9ecf06de3f8ea570ef9e0ff2..235a6fa371e471816284e3383e8564e9cf643a74 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -623,7 +623,7 @@ (define_insn "<sur>dot_prod<vsi2qi>"
;; These instructions map to the __builtins for the Armv8.6-a I8MM usdot
;; (vector) Dot Product operation and the vectorized optab.
-(define_insn "usdot_prod<vsi2qi>"
+(define_insn "usdot_prod<vsi2qi>_insn"
[(set (match_operand:VS 0 "register_operand" "=w")
(plus:VS
(unspec:VS [(match_operand:<VSI2QI> 1 "register_operand" "w")
@@ -635,6 +635,43 @@ (define_insn "usdot_prod<vsi2qi>"
[(set_attr "type" "neon_dot<q>")]
)
+;; usdot auto-vec fallback code
+;; Expander for the usdot_prod optab.  When TARGET_I8MM is enabled it emits
+;; usdot_prod<vsi2qi>_insn directly.  Otherwise (TARGET_DOTPROD only) it
+;; synthesizes the operation from a vector subtract and three sdot_prod
+;; expansions: operand 1 has the element sign bit subtracted from it, and the
+;; bias is corrected by further accumulating (signbit - 1) * operand 2 and
+;; 1 * operand 2.  For 8-bit elements this is (x - 128)y + 127y + y == xy.
+(define_expand "usdot_prod<vsi2qi>"
+ [(set (match_operand:VS 0 "register_operand")
+ (plus:VS
+ (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand")
+ (match_operand:<VSI2QI> 2 "register_operand")]
+ UNSPEC_USDOT)
+ (match_operand:VS 3 "register_operand")))]
+ "TARGET_DOTPROD || TARGET_I8MM"
+{
+ /* Use the dedicated usdot pattern whenever I8MM is enabled.  */
+ if (TARGET_I8MM)
+ {
+ emit_insn (gen_usdot_prod<vsi2qi>_insn (operands[0], operands[1],
+ operands[2], operands[3]));
+ DONE;
+ }
+
+ /* Fallback using only sdot_prod.  VAL is the sign bit of one input
+ element, i.e. 1 << (element bits - 1).
+ NOTE(review): the shift below is evaluated in int before being widened;
+ HOST_WIDE_INT_1 << ... would match the declared type -- confirm whether
+ that matters for the element modes this expander is instantiated for.  */
+ machine_mode elemmode = GET_MODE_INNER (<VSI2QI>mode);
+ HOST_WIDE_INT val = 1 << (GET_MODE_BITSIZE (elemmode).to_constant () - 1);
+ rtx signbit = gen_int_mode (val, elemmode);
+ /* T1 and T2 hold the intermediate accumulators; TMP holds the biased
+ copy of operand 1.  */
+ rtx t1 = gen_reg_rtx (<MODE>mode);
+ rtx t2 = gen_reg_rtx (<MODE>mode);
+ rtx tmp = gen_reg_rtx (<VSI2QI>mode);
+ /* Constant vectors: C1 = {VAL - 1, ...}, C2 = {1, ...} and
+ DUP = {sign bit, ...}.  C1 and C2 together supply the VAL * operand 2
+ bias correction.  */
+ rtx c1 = gen_const_vec_duplicate (<VSI2QI>mode,
+ gen_int_mode (val - 1, elemmode));
+ rtx c2 = gen_const_vec_duplicate (<VSI2QI>mode, gen_int_mode (1, elemmode));
+ rtx dup = gen_const_vec_duplicate (<VSI2QI>mode, signbit);
+ c1 = force_reg (<VSI2QI>mode, c1);
+ c2 = force_reg (<VSI2QI>mode, c2);
+ dup = force_reg (<VSI2QI>mode, dup);
+ /* tmp = operand 1 - DUP, then chain the three signed dot products through
+ T1 and T2 so the final accumulation lands in operand 0.  */
+ emit_insn (gen_sub<vsi2qi>3 (tmp, operands[1], dup));
+ emit_insn (gen_sdot_prod<vsi2qi> (t1, tmp, operands[2], operands[3]));
+ emit_insn (gen_sdot_prod<vsi2qi> (t2, c1, operands[2], t1));
+ emit_insn (gen_sdot_prod<vsi2qi> (operands[0], c2, operands[2], t2));
+ DONE;
+})
+
;; These instructions map to the __builtins for the Dot Product
;; indexed operations.
(define_insn "aarch64_<sur>dot_lane<vsi2qi>"
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec-2.c b/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec-2.c
new file mode 100644
index 0000000000000000000000000000000000000000..acd8e36209690386d021df72f1467a696750ac3e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec-2.c
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=armv8.2-a+noi8mm+dotprod" } */
+
+#define N 480
+#define SIGNEDNESS_1 unsigned
+#define SIGNEDNESS_2 signed
+#define SIGNEDNESS_3 signed
+#define SIGNEDNESS_4 unsigned
+
+/* For each of the N elements, multiply the signed char a[i] by the unsigned
+   char b[i], convert the product to a signed short and add it to the
+   unsigned accumulator res.  Returns the final value of res.  */
+SIGNEDNESS_1 int __attribute__ ((noipa))
+f (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict a,
+ SIGNEDNESS_4 char *restrict b)
+{
+ for (__INTPTR_TYPE__ i = 0; i < N; ++i)
+ {
+ int av = a[i];
+ int bv = b[i];
+ SIGNEDNESS_2 short mult = av * bv;
+ res += mult;
+ }
+ return res;
+}
+
+/* { dg-final { scan-assembler-not {\tusdot\t} } } */
+/* { dg-final { scan-assembler-times {\tsdot\t} 3 } } */
next reply other threads:[~2022-06-16 10:49 UTC|newest]
Thread overview: 12+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-06-16 10:48 Tamar Christina [this message]
2022-06-16 10:49 ` [PATCH 2/2] Add SVE " Tamar Christina
2022-06-16 16:09 ` [PATCH 1/2]AArch64 Add " Richard Sandiford
2022-06-16 18:53 ` Richard Sandiford
2022-06-27 5:24 ` Tamar Christina
2022-06-27 6:09 ` Richard Biener
2022-06-28 15:54 ` Tamar Christina
2022-06-29 9:33 ` Richard Biener
2022-06-29 14:35 ` Richard Sandiford
2022-06-30 6:45 ` Richard Biener
2022-07-05 6:08 ` Richard Sandiford
2022-07-05 7:41 ` Richard Biener
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=patch-15821-tamar@arm.com \
--to=tamar.christina@arm.com \
--cc=Kyrylo.Tkachov@arm.com \
--cc=Marcus.Shawcroft@arm.com \
--cc=Richard.Earnshaw@arm.com \
--cc=gcc-patches@gcc.gnu.org \
--cc=nd@arm.com \
--cc=richard.sandiford@arm.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).