From: Tamar Christina <Tamar.Christina@arm.com>
To: Richard Sandiford <Richard.Sandiford@arm.com>
Cc: "gcc-patches@gcc.gnu.org" <gcc-patches@gcc.gnu.org>,
nd <nd@arm.com>, Richard Earnshaw <Richard.Earnshaw@arm.com>,
Marcus Shawcroft <Marcus.Shawcroft@arm.com>,
Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
Subject: RE: [PATCH 2/4]AArch64: correct usdot vectorizer and intrinsics optabs
Date: Thu, 22 Jul 2021 11:50:23 +0000 [thread overview]
Message-ID: <VI1PR08MB53252A75714C8A8CB892175DFFE49@VI1PR08MB5325.eurprd08.prod.outlook.com> (raw)
In-Reply-To: <mptim15vu51.fsf@arm.com>
[-- Attachment #1: Type: text/plain, Size: 10858 bytes --]
Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
Ok for master?
Thanks,
Tamar
gcc/ChangeLog:
* config/aarch64/aarch64-builtins.c (TYPES_TERNOP_SUSS,
aarch64_types_ternop_suss_qualifiers): New.
* config/aarch64/aarch64-simd-builtins.def (usdot_prod): Use it.
* config/aarch64/aarch64-simd.md (usdot_prod<vsi2qi>): Re-organize RTL.
* config/aarch64/arm_neon.h (vusdot_s32, vusdotq_s32): Use it.
--- inline copy of patch --
diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index 9ed4b72d005799b8984a858f96d4763e7fa5aa39..f6b41d9c200d6300dee65ba60ae94488231a8a38 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -209,6 +209,10 @@ static enum aarch64_type_qualifiers
aarch64_types_ternop_ssus_qualifiers[SIMD_MAX_BUILTIN_ARGS]
= { qualifier_none, qualifier_none, qualifier_unsigned, qualifier_none };
#define TYPES_TERNOP_SSUS (aarch64_types_ternop_ssus_qualifiers)
+static enum aarch64_type_qualifiers
+aarch64_types_ternop_suss_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+ = { qualifier_none, qualifier_unsigned, qualifier_none, qualifier_none };
+#define TYPES_TERNOP_SUSS (aarch64_types_ternop_suss_qualifiers)
static enum aarch64_type_qualifiers
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index b7f1237b1ffd0d4ca283c853be1cc94b9fc35260..3bb45a82945b143497035ec30d35543b2dad55a3 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -377,7 +377,7 @@
/* Implemented by <sur><dotprod>_prod<dot_mode>. */
BUILTIN_VB (TERNOP, sdot, 0, NONE)
BUILTIN_VB (TERNOPU, udot, 0, NONE)
- BUILTIN_VB (TERNOP_SSUS, usdot_prod, 10, NONE)
+ BUILTIN_VB (TERNOP_SUSS, usdot_prod, 10, NONE)
/* Implemented by aarch64_<sur><dotprod>_lane{q}<dot_mode>. */
BUILTIN_VB (QUADOP_LANE, sdot_lane, 0, NONE)
BUILTIN_VB (QUADOPU_LANE, udot_lane, 0, NONE)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 7332a735d35846e0d9375ad2686ed7ecdb09cd29..bf667b99944e3fcce618a21c77bd5b804b3a0b5d 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -599,20 +599,6 @@ (define_insn "aarch64_<sur>dot<vsi2qi>"
[(set_attr "type" "neon_dot<q>")]
)
-;; These instructions map to the __builtins for the armv8.6a I8MM usdot
-;; (vector) Dot Product operation.
-(define_insn "usdot_prod<vsi2qi>"
- [(set (match_operand:VS 0 "register_operand" "=w")
- (plus:VS
- (unspec:VS [(match_operand:<VSI2QI> 2 "register_operand" "w")
- (match_operand:<VSI2QI> 3 "register_operand" "w")]
- UNSPEC_USDOT)
- (match_operand:VS 1 "register_operand" "0")))]
- "TARGET_I8MM"
- "usdot\\t%0.<Vtype>, %2.<Vdottype>, %3.<Vdottype>"
- [(set_attr "type" "neon_dot<q>")]
-)
-
;; These expands map to the Dot Product optab the vectorizer checks for.
;; The auto-vectorizer expects a dot product builtin that also does an
;; accumulation into the provided register.
@@ -648,6 +634,20 @@ (define_expand "<sur>dot_prod<vsi2qi>"
DONE;
})
+;; These instructions map to the __builtins for the Armv8.6-a I8MM usdot
+;; (vector) Dot Product operation and the vectorized optab.
+(define_insn "usdot_prod<vsi2qi>"
+ [(set (match_operand:VS 0 "register_operand" "=w")
+ (plus:VS
+ (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand" "w")
+ (match_operand:<VSI2QI> 2 "register_operand" "w")]
+ UNSPEC_USDOT)
+ (match_operand:VS 3 "register_operand" "0")))]
+ "TARGET_I8MM"
+ "usdot\\t%0.<Vtype>, %1.<Vdottype>, %2.<Vdottype>"
+ [(set_attr "type" "neon_dot<q>")]
+)
+
;; These instructions map to the __builtins for the Dot Product
;; indexed operations.
(define_insn "aarch64_<sur>dot_lane<vsi2qi>"
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 1048d7c7eaac14554142eaa7544159a50929b7f1..8396e872580bc9fb32b872f3915485b02ec2b334 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -34021,14 +34021,14 @@ __extension__ extern __inline int32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vusdot_s32 (int32x2_t __r, uint8x8_t __a, int8x8_t __b)
{
- return __builtin_aarch64_usdot_prodv8qi_ssus (__r, __a, __b);
+ return __builtin_aarch64_usdot_prodv8qi_suss (__a, __b, __r);
}
__extension__ extern __inline int32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vusdotq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b)
{
- return __builtin_aarch64_usdot_prodv16qi_ssus (__r, __a, __b);
+ return __builtin_aarch64_usdot_prodv16qi_suss (__a, __b, __r);
}
__extension__ extern __inline int32x2_t
> -----Original Message-----
> From: Richard Sandiford <richard.sandiford@arm.com>
> Sent: Tuesday, July 20, 2021 5:16 PM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
> Subject: Re: [PATCH 2/4]AArch64: correct usdot vectorizer and intrinsics
> optabs
>
> Tamar Christina <Tamar.Christina@arm.com> writes:
> >> -----Original Message-----
> >> From: Richard Sandiford <richard.sandiford@arm.com>
> >> Sent: Thursday, July 15, 2021 8:35 PM
> >> To: Tamar Christina <Tamar.Christina@arm.com>
> >> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
> >> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
> >> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
> <Kyrylo.Tkachov@arm.com>
> >> Subject: Re: [PATCH 2/4]AArch64: correct usdot vectorizer and
> >> intrinsics optabs
> >>
> >> Tamar Christina <tamar.christina@arm.com> writes:
> >> > Hi All,
> >> >
> >> > There's a slight mismatch between the vectorizer optabs and the
> >> > intrinsics patterns for NEON. The vectorizer expects operands[3]
> >> > and operands[0] to be the same but the aarch64 intrinsics expanders
> >> > expect operands[0] and operands[1] to be the same.
> >> >
> >> > This means we need different patterns here. This adds a separate
> >> > usdot vectorizer pattern which just shuffles around the RTL params.
> >> >
> >> > There's also an inconsistency between the usdot and (u|s)dot
> >> > intrinsics RTL patterns which is not corrected here.
> >> >
> >> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> >> >
> >> > Ok for master?
> >>
> >> Couldn't we just change:
> >>
> >> > diff --git a/gcc/config/aarch64/arm_neon.h
> >> > b/gcc/config/aarch64/arm_neon.h index
> >> >
> >>
> 00d76ea937ace5763746478cbdfadf6479e0b15a..17e059efb80fa86a8a32127ac
> >> e4f
> >> > c7f43e2040a8 100644
> >> > --- a/gcc/config/aarch64/arm_neon.h
> >> > +++ b/gcc/config/aarch64/arm_neon.h
> >> > @@ -34039,14 +34039,14 @@ __extension__ extern __inline int32x2_t
> >> > __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> >> > vusdot_s32 (int32x2_t __r, uint8x8_t __a, int8x8_t __b) {
> >> > - return __builtin_aarch64_usdot_prodv8qi_ssus (__r, __a, __b);
> >> > + return __builtin_aarch64_usdotv8qi_ssus (__r, __a, __b);
> >>
> >> …this to __builtin_aarch64_usdot_prodv8qi_ssus (__a, __b, __r) etc.?
> >
> > Not easily, as I was mentioning before, Neon intrinsics have the
> > assumption that operands[0] and operands[1] are the same. And this
> > goes much further than just the header call.
> >
> > The actual type is determined by the optabs and the C stubs that are
> generated.
> >
> > aarch64_init_simd_builtins which creates the C function stubs starts
> > processing arguments from the end and on non-void functions assumes
> > that the value at operands[0] be the return type. So simply moving __r
> > will get it to think that the result type should be uint8x8_t.
>
> Yeah, the mode of operand 0 (i.e. the output) determines the return type.
> But that mode isn't changing, so the return type will be correct for both input
> operand orders. It works for me locally with:
>
> diff --git a/gcc/config/aarch64/aarch64-simd.md
> b/gcc/config/aarch64/aarch64-simd.md
> index 88fa5ba5a44..5987d9af7c6 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -610,12 +610,12 @@ (define_expand "cmul<conj_op><mode>3"
> ;; and so the vectorizer provides r, in which the result has to be accumulated.
> (define_insn "<sur>dot_prod<vsi2qi>"
> [(set (match_operand:VS 0 "register_operand" "=w")
> - (plus:VS (match_operand:VS 1 "register_operand" "0")
> - (unspec:VS [(match_operand:<VSI2QI> 2 "register_operand"
> "w")
> - (match_operand:<VSI2QI> 3 "register_operand"
> "w")]
> - DOTPROD)))]
> + (plus:VS (unspec:VS [(match_operand:<VSI2QI> 1
> "register_operand" "w")
> + (match_operand:<VSI2QI> 2 "register_operand"
> "w")]
> + DOTPROD)
> + (match_operand:VS 3 "register_operand" "0")))]
> "TARGET_DOTPROD"
> - "<sur>dot\\t%0.<Vtype>, %2.<Vdottype>, %3.<Vdottype>"
> + "<sur>dot\\t%0.<Vtype>, %1.<Vdottype>, %2.<Vdottype>"
> [(set_attr "type" "neon_dot<q>")]
> )
>
> diff --git a/gcc/config/aarch64/arm_neon.h
> b/gcc/config/aarch64/arm_neon.h index 597f44ce106..64b6d43a1a0 100644
> --- a/gcc/config/aarch64/arm_neon.h
> +++ b/gcc/config/aarch64/arm_neon.h
> @@ -31767,28 +31767,28 @@ __extension__ extern __inline uint32x2_t
> __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> vdot_u32 (uint32x2_t __r, uint8x8_t __a, uint8x8_t __b) {
> - return __builtin_aarch64_udot_prodv8qi_uuuu (__r, __a, __b);
> + return __builtin_aarch64_udot_prodv8qi_uuuu (__a, __b, __r);
> }
>
> __extension__ extern __inline uint32x4_t __attribute__
> ((__always_inline__, __gnu_inline__, __artificial__))
> vdotq_u32 (uint32x4_t __r, uint8x16_t __a, uint8x16_t __b) {
> - return __builtin_aarch64_udot_prodv16qi_uuuu (__r, __a, __b);
> + return __builtin_aarch64_udot_prodv16qi_uuuu (__a, __b, __r);
> }
>
> __extension__ extern __inline int32x2_t __attribute__
> ((__always_inline__, __gnu_inline__, __artificial__))
> vdot_s32 (int32x2_t __r, int8x8_t __a, int8x8_t __b) {
> - return __builtin_aarch64_sdot_prodv8qi (__r, __a, __b);
> + return __builtin_aarch64_sdot_prodv8qi (__a, __b, __r);
> }
>
> __extension__ extern __inline int32x4_t __attribute__
> ((__always_inline__, __gnu_inline__, __artificial__))
> vdotq_s32 (int32x4_t __r, int8x16_t __a, int8x16_t __b) {
> - return __builtin_aarch64_sdot_prodv16qi (__r, __a, __b);
> + return __builtin_aarch64_sdot_prodv16qi (__a, __b, __r);
> }
>
> __extension__ extern __inline uint32x2_t
>
> Thanks,
> Richard
[-- Attachment #2: rb14659.patch --]
[-- Type: application/octet-stream, Size: 4289 bytes --]
diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index 9ed4b72d005799b8984a858f96d4763e7fa5aa39..f6b41d9c200d6300dee65ba60ae94488231a8a38 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -209,6 +209,10 @@ static enum aarch64_type_qualifiers
aarch64_types_ternop_ssus_qualifiers[SIMD_MAX_BUILTIN_ARGS]
= { qualifier_none, qualifier_none, qualifier_unsigned, qualifier_none };
#define TYPES_TERNOP_SSUS (aarch64_types_ternop_ssus_qualifiers)
+static enum aarch64_type_qualifiers
+aarch64_types_ternop_suss_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+ = { qualifier_none, qualifier_unsigned, qualifier_none, qualifier_none };
+#define TYPES_TERNOP_SUSS (aarch64_types_ternop_suss_qualifiers)
static enum aarch64_type_qualifiers
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index b7f1237b1ffd0d4ca283c853be1cc94b9fc35260..3bb45a82945b143497035ec30d35543b2dad55a3 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -377,7 +377,7 @@
/* Implemented by <sur><dotprod>_prod<dot_mode>. */
BUILTIN_VB (TERNOP, sdot, 0, NONE)
BUILTIN_VB (TERNOPU, udot, 0, NONE)
- BUILTIN_VB (TERNOP_SSUS, usdot_prod, 10, NONE)
+ BUILTIN_VB (TERNOP_SUSS, usdot_prod, 10, NONE)
/* Implemented by aarch64_<sur><dotprod>_lane{q}<dot_mode>. */
BUILTIN_VB (QUADOP_LANE, sdot_lane, 0, NONE)
BUILTIN_VB (QUADOPU_LANE, udot_lane, 0, NONE)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 7332a735d35846e0d9375ad2686ed7ecdb09cd29..bf667b99944e3fcce618a21c77bd5b804b3a0b5d 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -599,20 +599,6 @@ (define_insn "aarch64_<sur>dot<vsi2qi>"
[(set_attr "type" "neon_dot<q>")]
)
-;; These instructions map to the __builtins for the armv8.6a I8MM usdot
-;; (vector) Dot Product operation.
-(define_insn "usdot_prod<vsi2qi>"
- [(set (match_operand:VS 0 "register_operand" "=w")
- (plus:VS
- (unspec:VS [(match_operand:<VSI2QI> 2 "register_operand" "w")
- (match_operand:<VSI2QI> 3 "register_operand" "w")]
- UNSPEC_USDOT)
- (match_operand:VS 1 "register_operand" "0")))]
- "TARGET_I8MM"
- "usdot\\t%0.<Vtype>, %2.<Vdottype>, %3.<Vdottype>"
- [(set_attr "type" "neon_dot<q>")]
-)
-
;; These expands map to the Dot Product optab the vectorizer checks for.
;; The auto-vectorizer expects a dot product builtin that also does an
;; accumulation into the provided register.
@@ -648,6 +634,20 @@ (define_expand "<sur>dot_prod<vsi2qi>"
DONE;
})
+;; These instructions map to the __builtins for the Armv8.6-a I8MM usdot
+;; (vector) Dot Product operation and the vectorized optab.
+(define_insn "usdot_prod<vsi2qi>"
+ [(set (match_operand:VS 0 "register_operand" "=w")
+ (plus:VS
+ (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand" "w")
+ (match_operand:<VSI2QI> 2 "register_operand" "w")]
+ UNSPEC_USDOT)
+ (match_operand:VS 3 "register_operand" "0")))]
+ "TARGET_I8MM"
+ "usdot\\t%0.<Vtype>, %1.<Vdottype>, %2.<Vdottype>"
+ [(set_attr "type" "neon_dot<q>")]
+)
+
;; These instructions map to the __builtins for the Dot Product
;; indexed operations.
(define_insn "aarch64_<sur>dot_lane<vsi2qi>"
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 1048d7c7eaac14554142eaa7544159a50929b7f1..8396e872580bc9fb32b872f3915485b02ec2b334 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -34021,14 +34021,14 @@ __extension__ extern __inline int32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vusdot_s32 (int32x2_t __r, uint8x8_t __a, int8x8_t __b)
{
- return __builtin_aarch64_usdot_prodv8qi_ssus (__r, __a, __b);
+ return __builtin_aarch64_usdot_prodv8qi_suss (__a, __b, __r);
}
__extension__ extern __inline int32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vusdotq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b)
{
- return __builtin_aarch64_usdot_prodv16qi_ssus (__r, __a, __b);
+ return __builtin_aarch64_usdot_prodv16qi_suss (__a, __b, __r);
}
__extension__ extern __inline int32x2_t
next prev parent reply other threads:[~2021-07-22 11:50 UTC|newest]
Thread overview: 16+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-07-15 16:39 [PATCH 1/4][committed] testsuite: Fix testisms in scalar tests PR101457 Tamar Christina
2021-07-15 16:39 ` [PATCH 2/4]AArch64: correct usdot vectorizer and intrinsics optabs Tamar Christina
2021-07-15 19:34 ` Richard Sandiford
2021-07-20 12:34 ` Tamar Christina
2021-07-20 16:15 ` Richard Sandiford
2021-07-22 11:50 ` Tamar Christina [this message]
2021-07-22 18:09 ` Richard Sandiford
2021-07-15 16:40 ` [PATCH 3/4]AArch64: correct dot-product RTL patterns for aarch64 Tamar Christina
2021-07-15 19:44 ` Richard Sandiford
2021-07-22 11:51 ` Tamar Christina
2021-07-22 18:11 ` Richard Sandiford
2021-07-23 8:14 ` Tamar Christina
2021-07-26 13:56 ` Richard Sandiford
2021-07-15 16:40 ` [PATCH 4/4][AArch32]: correct dot-product RTL patterns Tamar Christina
2021-07-16 2:20 ` [PATCH 1/4][committed] testsuite: Fix testisms in scalar tests PR101457 H.J. Lu
2021-07-16 8:42 ` Tamar Christina
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=VI1PR08MB53252A75714C8A8CB892175DFFE49@VI1PR08MB5325.eurprd08.prod.outlook.com \
--to=tamar.christina@arm.com \
--cc=Kyrylo.Tkachov@arm.com \
--cc=Marcus.Shawcroft@arm.com \
--cc=Richard.Earnshaw@arm.com \
--cc=Richard.Sandiford@arm.com \
--cc=gcc-patches@gcc.gnu.org \
--cc=nd@arm.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).