Hi All,

There's a slight mismatch between the vectorizer optabs and the intrinsics
patterns for NEON.  The vectorizer expects operands[3] and operands[0] to be
the same but the aarch64 intrinsics expanders expect operands[0] and
operands[1] to be the same.

This means we need different patterns here.  This adds a separate usdot
vectorizer pattern which just shuffles around the RTL params.

There's also an inconsistency between the usdot and (u|s)dot intrinsics RTL
patterns which is not corrected here.

Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* config/aarch64/aarch64-simd.md (usdot_prod<vsi2qi>): Rename to...
	(aarch64_usdot<vsi2qi>): ... this.
	(usdot_prod<vsi2qi>): New.
	* config/aarch64/arm_neon.h (vusdot_s32, vusdotq_s32): Use
	aarch64_usdot<vsi2qi>.
	* config/aarch64/aarch64-simd-builtins.def: Likewise.

--- inline copy of patch -- 
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 063f503ebd96657f017dfaa067cb231991376bda..ac5d4fc7ff1e61d404e66193b629986382ee4ffd 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -374,11 +374,10 @@
   BUILTIN_VSDQ_I_DI (BINOP, srshl, 0, NONE)
   BUILTIN_VSDQ_I_DI (BINOP_UUS, urshl, 0, NONE)
 
-  /* Implemented by <sur><dotprod>_prod<dot_mode>.  */
+  /* Implemented by aarch64_<sur><dotprod>{_lane}{q}<dot_mode>.  */
   BUILTIN_VB (TERNOP, sdot, 0, NONE)
   BUILTIN_VB (TERNOPU, udot, 0, NONE)
-  BUILTIN_VB (TERNOP_SSUS, usdot_prod, 10, NONE)
-  /* Implemented by aarch64_<sur><dotprod>_lane{q}<dot_mode>.  */
+  BUILTIN_VB (TERNOP_SSUS, usdot, 0, NONE)
   BUILTIN_VB (QUADOP_LANE, sdot_lane, 0, NONE)
   BUILTIN_VB (QUADOPU_LANE, udot_lane, 0, NONE)
   BUILTIN_VB (QUADOP_LANE, sdot_laneq, 0, NONE)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 74890989cb3045798bf8d0241467eaaf72238297..7397f1ec5ca0cb9e3cdd5c46772f604e640666e4 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -601,7 +601,7 @@ (define_insn "aarch64_<sur>dot<vsi2qi>"
 
 ;; These instructions map to the __builtins for the armv8.6a I8MM usdot
 ;; (vector) Dot Product operation.
-(define_insn "usdot_prod<vsi2qi>"
+(define_insn "aarch64_usdot<vsi2qi>"
   [(set (match_operand:VS 0 "register_operand" "=w")
 	(plus:VS
 	  (unspec:VS [(match_operand:<VSI2QI> 2 "register_operand" "w")
@@ -648,6 +648,17 @@ (define_expand "<sur>dot_prod<vsi2qi>"
   DONE;
 })
 
+;; Auto-vectorizer pattern for usdot.  For the vectorizer optab, operands[3]
+;; (the accumulator input) and operands[0] (the result) are the RMW pair.
+(define_expand "usdot_prod<vsi2qi>"
+  [(set (match_operand:VS 0 "register_operand")
+	(plus:VS (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand")
+			    (match_operand:<VSI2QI> 2 "register_operand")]
+		 UNSPEC_USDOT)
+		 (match_operand:VS 3 "register_operand")))]
+  "TARGET_I8MM"
+)
+
 ;; These instructions map to the __builtins for the Dot Product
 ;; indexed operations.
 (define_insn "aarch64_<sur>dot_lane<vsi2qi>"
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 00d76ea937ace5763746478cbdfadf6479e0b15a..17e059efb80fa86a8a32127ace4fc7f43e2040a8 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -34039,14 +34039,14 @@ __extension__ extern __inline int32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vusdot_s32 (int32x2_t __r, uint8x8_t __a, int8x8_t __b)
 {
-  return __builtin_aarch64_usdot_prodv8qi_ssus (__r, __a, __b);
+  return __builtin_aarch64_usdotv8qi_ssus (__r, __a, __b);
 }
 
 __extension__ extern __inline int32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vusdotq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b)
 {
-  return __builtin_aarch64_usdot_prodv16qi_ssus (__r, __a, __b);
+  return __builtin_aarch64_usdotv16qi_ssus (__r, __a, __b);
 }
 
 __extension__ extern __inline int32x2_t


--