[gcc r12-2508] AArch64: correct dot-product RTL patterns for aarch64.

public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed

* [gcc r12-2508] AArch64: correct dot-product RTL patterns for aarch64.
@ 2021-07-26  9:28 Tamar Christina
  0 siblings, 0 replies; only message in thread
From: Tamar Christina @ 2021-07-26  9:28 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:1ab2270036dc0f2a13442ce682267bc7433ffb34

commit r12-2508-g1ab2270036dc0f2a13442ce682267bc7433ffb34
Author: Tamar Christina <tamar.christina@arm.com>
Date:   Mon Jul 26 10:23:21 2021 +0100

    AArch64: correct dot-product RTL patterns for aarch64.
    
    The previous fix for this problem was wrong due to a subtle difference between
    where NEON expects the RMW values and where intrinsics expects them.
    
    The insn pattern is modeled after the intrinsics and so needs an expand for
    the vectorizer optab to switch the RTL.
    
    However operand[3] is not expected to be written to so the current pattern is
    bogus.
    
    Instead I rewrite the RTL to be in canonical ordering and merge them.
    
    gcc/ChangeLog:
    
            * config/aarch64/aarch64-simd-builtins.def (sdot, udot): Rename to..
            (sdot_prod, udot_prod): ... This.
            * config/aarch64/aarch64-simd.md (aarch64_<sur>dot<vsi2qi>): Merged
            into...
            (<sur>dot_prod<vsi2qi>): ... this.
            (aarch64_<sur>dot_lane<vsi2qi>, aarch64_<sur>dot_laneq<vsi2qi>):
            Change operands order.
            (<sur>sadv16qi): Use new operands order.
            * config/aarch64/arm_neon.h (vdot_u32, vdotq_u32, vdot_s32,
            vdotq_s32): Use new RTL ordering.

Diff:
---
 gcc/config/aarch64/aarch64-simd-builtins.def |  4 +-
 gcc/config/aarch64/aarch64-simd.md           | 63 +++++++++++-----------------
 gcc/config/aarch64/arm_neon.h                |  8 ++--
 3 files changed, 31 insertions(+), 44 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 3bb45a82945..402453aa9bb 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -375,8 +375,8 @@
   BUILTIN_VSDQ_I_DI (BINOP_UUS, urshl, 0, NONE)
 
   /* Implemented by <sur><dotprod>_prod<dot_mode>.  */
-  BUILTIN_VB (TERNOP, sdot, 0, NONE)
-  BUILTIN_VB (TERNOPU, udot, 0, NONE)
+  BUILTIN_VB (TERNOP, sdot_prod, 10, NONE)
+  BUILTIN_VB (TERNOPU, udot_prod, 10, NONE)
   BUILTIN_VB (TERNOP_SUSS, usdot_prod, 10, NONE)
   /* Implemented by aarch64_<sur><dotprod>_lane{q}<dot_mode>.  */
   BUILTIN_VB (QUADOP_LANE, sdot_lane, 0, NONE)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index bf667b99944..13c86984df1 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -587,19 +587,8 @@
   DONE;
 })
 
-;; These instructions map to the __builtins for the Dot Product operations.
-(define_insn "aarch64_<sur>dot<vsi2qi>"
-  [(set (match_operand:VS 0 "register_operand" "=w")
-	(plus:VS (match_operand:VS 1 "register_operand" "0")
-		(unspec:VS [(match_operand:<VSI2QI> 2 "register_operand" "w")
-			    (match_operand:<VSI2QI> 3 "register_operand" "w")]
-		DOTPROD)))]
-  "TARGET_DOTPROD"
-  "<sur>dot\\t%0.<Vtype>, %2.<Vdottype>, %3.<Vdottype>"
-  [(set_attr "type" "neon_dot<q>")]
-)
-
-;; These expands map to the Dot Product optab the vectorizer checks for.
+;; These expands map to the Dot Product optab the vectorizer checks for
+;; and to the intrinsics patttern.
 ;; The auto-vectorizer expects a dot product builtin that also does an
 ;; accumulation into the provided register.
 ;; Given the following pattern
@@ -619,20 +608,17 @@
 ;; ...
 ;;
 ;; and so the vectorizer provides r, in which the result has to be accumulated.
-(define_expand "<sur>dot_prod<vsi2qi>"
-  [(set (match_operand:VS 0 "register_operand")
-	(plus:VS (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand")
-			    (match_operand:<VSI2QI> 2 "register_operand")]
-		 DOTPROD)
-		(match_operand:VS 3 "register_operand")))]
+(define_insn "<sur>dot_prod<vsi2qi>"
+  [(set (match_operand:VS 0 "register_operand" "=w")
+	(plus:VS
+	  (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand" "w")
+		      (match_operand:<VSI2QI> 2 "register_operand" "w")]
+		      DOTPROD)
+	  (match_operand:VS 3 "register_operand" "0")))]
   "TARGET_DOTPROD"
-{
-  emit_insn (
-    gen_aarch64_<sur>dot<vsi2qi> (operands[3], operands[3], operands[1],
-				    operands[2]));
-  emit_insn (gen_rtx_SET (operands[0], operands[3]));
-  DONE;
-})
+  "<sur>dot\\t%0.<Vtype>, %1.<Vdottype>, %2.<Vdottype>"
+  [(set_attr "type" "neon_dot<q>")]
+)
 
 ;; These instructions map to the __builtins for the Armv8.6-a I8MM usdot
 ;; (vector) Dot Product operation and the vectorized optab.
@@ -652,11 +638,12 @@
 ;; indexed operations.
 (define_insn "aarch64_<sur>dot_lane<vsi2qi>"
   [(set (match_operand:VS 0 "register_operand" "=w")
-	(plus:VS (match_operand:VS 1 "register_operand" "0")
-		(unspec:VS [(match_operand:<VSI2QI> 2 "register_operand" "w")
-			    (match_operand:V8QI 3 "register_operand" "<h_con>")
-			    (match_operand:SI 4 "immediate_operand" "i")]
-		DOTPROD)))]
+	(plus:VS
+	  (unspec:VS [(match_operand:<VSI2QI> 2 "register_operand" "w")
+		      (match_operand:V8QI 3 "register_operand" "<h_con>")
+		      (match_operand:SI 4 "immediate_operand" "i")]
+		      DOTPROD)
+	  (match_operand:VS 1 "register_operand" "0")))]
   "TARGET_DOTPROD"
   {
     operands[4] = aarch64_endian_lane_rtx (V8QImode, INTVAL (operands[4]));
@@ -667,11 +654,12 @@
 
 (define_insn "aarch64_<sur>dot_laneq<vsi2qi>"
   [(set (match_operand:VS 0 "register_operand" "=w")
-	(plus:VS (match_operand:VS 1 "register_operand" "0")
-		(unspec:VS [(match_operand:<VSI2QI> 2 "register_operand" "w")
-			    (match_operand:V16QI 3 "register_operand" "<h_con>")
-			    (match_operand:SI 4 "immediate_operand" "i")]
-		DOTPROD)))]
+	(plus:VS
+	  (unspec:VS [(match_operand:<VSI2QI> 2 "register_operand" "w")
+		      (match_operand:V16QI 3 "register_operand" "<h_con>")
+		      (match_operand:SI 4 "immediate_operand" "i")]
+		      DOTPROD)
+	  (match_operand:VS 1 "register_operand" "0")))]
   "TARGET_DOTPROD"
   {
     operands[4] = aarch64_endian_lane_rtx (V16QImode, INTVAL (operands[4]));
@@ -944,8 +932,7 @@
 	rtx ones = force_reg (V16QImode, CONST1_RTX (V16QImode));
 	rtx abd = gen_reg_rtx (V16QImode);
 	emit_insn (gen_aarch64_<sur>abdv16qi (abd, operands[1], operands[2]));
-	emit_insn (gen_aarch64_udotv16qi (operands[0], operands[3],
-					  abd, ones));
+	emit_insn (gen_udot_prodv16qi (operands[0], abd, ones, operands[3]));
 	DONE;
       }
     rtx reduc = gen_reg_rtx (V8HImode);
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 0f439943ffe..313b35fc20e 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -31472,28 +31472,28 @@ __extension__ extern __inline uint32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vdot_u32 (uint32x2_t __r, uint8x8_t __a, uint8x8_t __b)
 {
-  return __builtin_aarch64_udotv8qi_uuuu (__r, __a, __b);
+  return __builtin_aarch64_udot_prodv8qi_uuuu (__a, __b, __r);
 }
 
 __extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vdotq_u32 (uint32x4_t __r, uint8x16_t __a, uint8x16_t __b)
 {
-  return __builtin_aarch64_udotv16qi_uuuu (__r, __a, __b);
+  return __builtin_aarch64_udot_prodv16qi_uuuu (__a, __b, __r);
 }
 
 __extension__ extern __inline int32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vdot_s32 (int32x2_t __r, int8x8_t __a, int8x8_t __b)
 {
-  return __builtin_aarch64_sdotv8qi (__r, __a, __b);
+  return __builtin_aarch64_sdot_prodv8qi (__a, __b, __r);
 }
 
 __extension__ extern __inline int32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vdotq_s32 (int32x4_t __r, int8x16_t __a, int8x16_t __b)
 {
-  return __builtin_aarch64_sdotv16qi (__r, __a, __b);
+  return __builtin_aarch64_sdot_prodv16qi (__a, __b, __r);
 }
 
 __extension__ extern __inline uint32x2_t


^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2021-07-26  9:28 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-07-26  9:28 [gcc r12-2508] AArch64: correct dot-product RTL patterns for aarch64 Tamar Christina

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).