From: Tamar Christina <tamar.christina@arm.com>
To: gcc-patches@gcc.gnu.org
Cc: nd@arm.com, james.greenhalgh@arm.com, Richard.Earnshaw@arm.com,
Marcus.Shawcroft@arm.com
Subject: [PATCH][GCC][AArch64] Dot Product SIMD patterns [Patch (5/8)]
Date: Fri, 01 Sep 2017 13:22:00 -0000 [thread overview]
Message-ID: <20170901132215.GA32134@arm.com> (raw)
[-- Attachment #1: Type: text/plain, Size: 940 bytes --]
Hi All,
This patch adds the instructions for Dot Product to AArch64 along
with the intrinsics and vectorizer pattern.
Armv8.2-a dot product supports 8-bit element values both
signed and unsigned.
Dot product is available from Arm8.2-a and onwards.
Regtested and bootstrapped on aarch64-none-elf and no issues.
Ok for trunk?
gcc/
2017-09-01 Tamar Christina <tamar.christina@arm.com>
* config/aarch64/aarch64-builtins.c
(aarch64_types_quadopu_lane_qualifiers): New.
(TYPES_QUADOPU_LANE): New.
* config/aarch64/aarch64-simd.md (aarch64_<sur>dot<dot_mode>): New.
(<sur>dot_prod<dot_mode>, aarch64_<sur>dot_lane<dot_mode>): New.
(aarch64_<sur>dot_laneq<dot_mode>): New.
* config/aarch64/aarch64-simd-builtins.def (sdot, udot): New.
(sdot_lane, udot_lane, sdot_laneq, udot_laneq): New.
* config/aarch64/iterators.md (UNSPEC_SDOT, UNSPEC_UDOT): New.
(DOT_MODE, dot_mode, Vdottype, DOTPROD): New.
(sur): Add SDOT and UDOT.
--
[-- Attachment #2: 7945-diff.patch --]
[-- Type: text/x-diff, Size: 6778 bytes --]
diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index d30009ba441cabc511ddbc821379daae6de09fa2..a1b598c3da29ca791c261ca8a6f918573a818974 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -168,6 +168,11 @@ aarch64_types_quadop_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS]
= { qualifier_none, qualifier_none, qualifier_none,
qualifier_none, qualifier_lane_index };
#define TYPES_QUADOP_LANE (aarch64_types_quadop_lane_qualifiers)
+static enum aarch64_type_qualifiers
+aarch64_types_quadopu_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+ = { qualifier_unsigned, qualifier_unsigned, qualifier_unsigned,
+ qualifier_unsigned, qualifier_lane_index };
+#define TYPES_QUADOPU_LANE (aarch64_types_quadopu_lane_qualifiers)
static enum aarch64_type_qualifiers
aarch64_types_binop_imm_p_qualifiers[SIMD_MAX_BUILTIN_ARGS]
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index d713d5d8b88837ec6f2dc51188fb252f8d5bc8bd..52d01342372e518b1238ea14097e8f0574e9a605 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -205,6 +205,14 @@
BUILTIN_VSDQ_I_DI (BINOP, srshl, 0)
BUILTIN_VSDQ_I_DI (BINOP_UUS, urshl, 0)
+ /* Implemented by aarch64_<sur><dotprod>{_lane}{q}<dot_mode>. */
+ BUILTIN_VB (TERNOP, sdot, 0)
+ BUILTIN_VB (TERNOPU, udot, 0)
+ BUILTIN_VB (QUADOP_LANE, sdot_lane, 0)
+ BUILTIN_VB (QUADOPU_LANE, udot_lane, 0)
+ BUILTIN_VB (QUADOP_LANE, sdot_laneq, 0)
+ BUILTIN_VB (QUADOPU_LANE, udot_laneq, 0)
+
BUILTIN_VDQ_I (SHIFTIMM, ashr, 3)
VAR1 (SHIFTIMM, ashr_simd, 0, di)
BUILTIN_VDQ_I (SHIFTIMM, lshr, 3)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index f3e084f8778d70c82823b92fa80ff96021ad26db..21d46c84ab317c2d62afdf8c48117886aaf483b0 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -386,6 +386,87 @@
}
)
+;; These instructions map to the __builtins for the Dot Product operations.
+(define_insn "aarch64_<sur>dot<dot_mode>"
+ [(set (match_operand:VS 0 "register_operand" "=w")
+ (unspec:VS [(match_operand:VS 1 "register_operand" "0")
+ (match_operand:<DOT_MODE> 2 "register_operand" "w")
+ (match_operand:<DOT_MODE> 3 "register_operand" "w")]
+ DOTPROD))]
+ "TARGET_DOTPROD"
+ "<sur>dot\\t%0.<Vtype>, %2.<Vdottype>, %3.<Vdottype>"
+ [(set_attr "type" "neon_dot")]
+)
+
+;; These expands map to the Dot Product optab the vectorizer checks for.
+;; The auto-vectorizer expects a dot product builtin that also does an
+;; accumulation into the provided register.
+;; Given the following pattern
+;;
+;; for (i=0; i<len; i++) {
+;; c = a[i] * b[i];
+;; r += c;
+;; }
+;; return result;
+;;
+;; This can be auto-vectorized to
+;; r = a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3];
+;;
+;; given enough iterations. However the vectorizer can keep unrolling the loop
+;; r += a[4]*b[4] + a[5]*b[5] + a[6]*b[6] + a[7]*b[7];
+;; r += a[8]*b[8] + a[9]*b[9] + a[10]*b[10] + a[11]*b[11];
+;; ...
+;;
+;; and so the vectorizer provides r, in which the result has to be accumulated.
+(define_expand "<sur>dot_prod<dot_mode>"
+ [(set (match_operand:VS 0 "register_operand")
+ (unspec:VS [(match_operand:<DOT_MODE> 1 "register_operand")
+ (match_operand:<DOT_MODE> 2 "register_operand")
+ (match_operand:VS 3 "register_operand")]
+ DOTPROD))]
+ "TARGET_DOTPROD"
+{
+ emit_insn (
+ gen_aarch64_<sur>dot<dot_mode> (operands[3], operands[3], operands[1],
+ operands[2]));
+ emit_insn (gen_rtx_SET (operands[0], operands[3]));
+ DONE;
+})
+
+;; These instructions map to the __builtins for the Dot Product
+;; indexed operations.
+(define_insn "aarch64_<sur>dot_lane<dot_mode>"
+ [(set (match_operand:VS 0 "register_operand" "=w")
+ (unspec:VS [(match_operand:VS 1 "register_operand" "0")
+ (match_operand:<DOT_MODE> 2 "register_operand" "w")
+ (match_operand:V8QI 3 "register_operand" "<h_con>")
+ (match_operand:SI 4 "immediate_operand" "i")]
+ DOTPROD))]
+ "TARGET_DOTPROD"
+ {
+ operands[4]
+ = GEN_INT (ENDIAN_LANE_N (V8QImode, INTVAL (operands[4])));
+ return "<sur>dot\\t%0.<Vtype>, %2.<Vdottype>, %3.4b[%4]";
+ }
+ [(set_attr "type" "neon_dot")]
+)
+
+(define_insn "aarch64_<sur>dot_laneq<dot_mode>"
+ [(set (match_operand:VS 0 "register_operand" "=w")
+ (unspec:VS [(match_operand:VS 1 "register_operand" "0")
+ (match_operand:<DOT_MODE> 2 "register_operand" "w")
+ (match_operand:V16QI 3 "register_operand" "<h_con>")
+ (match_operand:SI 4 "immediate_operand" "i")]
+ DOTPROD))]
+ "TARGET_DOTPROD"
+ {
+ operands[4]
+ = GEN_INT (ENDIAN_LANE_N (V16QImode, INTVAL (operands[4])));
+ return "<sur>dot\\t%0.<Vtype>, %2.<Vdottype>, %3.4b[%4]";
+ }
+ [(set_attr "type" "neon_dot")]
+)
+
(define_expand "copysign<mode>3"
[(match_operand:VHSDF 0 "register_operand")
(match_operand:VHSDF 1 "register_operand")
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index cceb57525c7aa44933419bd317b1f03a7b76f4c4..533c12cca916669195e9b094527ee0de31542b12 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -354,6 +354,8 @@
UNSPEC_SQRDMLSH ; Used in aarch64-simd.md.
UNSPEC_FMAXNM ; Used in aarch64-simd.md.
UNSPEC_FMINNM ; Used in aarch64-simd.md.
+ UNSPEC_SDOT ; Used in aarch64-simd.md.
+ UNSPEC_UDOT ; Used in aarch64-simd.md.
])
;; ------------------------------------------------------------------
@@ -810,6 +812,13 @@
(define_mode_attr vsi2qi [(V2SI "v8qi") (V4SI "v16qi")])
(define_mode_attr VSI2QI [(V2SI "V8QI") (V4SI "V16QI")])
+;; Mapping attribute for Dot Product input modes based on result mode.
+(define_mode_attr DOT_MODE [(V2SI "V8QI") (V4SI "V16QI")])
+(define_mode_attr dot_mode [(V2SI "v8qi") (V4SI "v16qi")])
+
+;; Register suffix for DOTPROD input types from the return type.
+(define_mode_attr Vdottype [(V2SI "8b") (V4SI "16b")])
+
;; Sum of lengths of instructions needed to move vector registers of a mode.
(define_mode_attr insn_count [(OI "8") (CI "12") (XI "16")])
@@ -1039,6 +1048,7 @@
UNSPEC_SHSUB UNSPEC_UHSUB
UNSPEC_SRHSUB UNSPEC_URHSUB])
+(define_int_iterator DOTPROD [UNSPEC_SDOT UNSPEC_UDOT])
(define_int_iterator ADDSUBHN [UNSPEC_ADDHN UNSPEC_RADDHN
UNSPEC_SUBHN UNSPEC_RSUBHN])
@@ -1176,6 +1186,7 @@
(UNSPEC_USHLL "u") (UNSPEC_SSHLL "s")
(UNSPEC_URSHL "ur") (UNSPEC_SRSHL "sr")
(UNSPEC_UQRSHL "u") (UNSPEC_SQRSHL "s")
+ (UNSPEC_SDOT "s") (UNSPEC_UDOT "u")
])
(define_int_attr r [(UNSPEC_SQDMULH "") (UNSPEC_SQRDMULH "r")
next reply other threads:[~2017-09-01 13:22 UTC|newest]
Thread overview: 6+ messages / expand[flat|nested] mbox.gz Atom feed top
2017-09-01 13:22 Tamar Christina [this message]
2017-09-04 11:01 ` James Greenhalgh
2017-09-05 18:42 ` Tamar Christina
2017-10-06 12:45 ` Tamar Christina
2017-10-12 13:20 ` Richard Earnshaw (lists)
2017-10-12 15:49 ` Tamar Christina
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20170901132215.GA32134@arm.com \
--to=tamar.christina@arm.com \
--cc=Marcus.Shawcroft@arm.com \
--cc=Richard.Earnshaw@arm.com \
--cc=gcc-patches@gcc.gnu.org \
--cc=james.greenhalgh@arm.com \
--cc=nd@arm.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).