Hi All, There's a slight mismatch between the vectorizer optabs and the intrinsics patterns for NEON. The vectorizer expects operands[3] and operands[0] to be the same but the aarch64 intrinsics expanders expect operands[0] and operands[1] to be the same. This means we need different patterns here. This adds a separate usdot vectorizer pattern which just shuffles around the RTL params. There's also an inconsistency between the usdot and (u|s)dot intrinsics RTL patterns which is not corrected here. Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. Ok for master? Thanks, Tamar gcc/ChangeLog: * config/aarch64/aarch64-simd.md (usdot_prod): Rename to... (aarch64_usdot): ..This (usdot_prod): New. * config/aarch64/arm_neon.h (vusdot_s32, vusdotq_s32): Use aarch64_usdot. * config/aarch64/aarch64-simd-builtins.def: Likewise. --- inline copy of patch -- diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index 063f503ebd96657f017dfaa067cb231991376bda..ac5d4fc7ff1e61d404e66193b629986382ee4ffd 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -374,11 +374,10 @@ BUILTIN_VSDQ_I_DI (BINOP, srshl, 0, NONE) BUILTIN_VSDQ_I_DI (BINOP_UUS, urshl, 0, NONE) - /* Implemented by _prod. */ + /* Implemented by aarch64_{_lane}{q}. */ BUILTIN_VB (TERNOP, sdot, 0, NONE) BUILTIN_VB (TERNOPU, udot, 0, NONE) - BUILTIN_VB (TERNOP_SSUS, usdot_prod, 10, NONE) - /* Implemented by aarch64__lane{q}. */ + BUILTIN_VB (TERNOP_SSUS, usdot, 0, NONE) BUILTIN_VB (QUADOP_LANE, sdot_lane, 0, NONE) BUILTIN_VB (QUADOPU_LANE, udot_lane, 0, NONE) BUILTIN_VB (QUADOP_LANE, sdot_laneq, 0, NONE) diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 74890989cb3045798bf8d0241467eaaf72238297..7397f1ec5ca0cb9e3cdd5c46772f604e640666e4 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -601,7 +601,7 @@ (define_insn "aarch64_dot" ;; These instructions map to the __builtins for the armv8.6a I8MM usdot ;; (vector) Dot Product operation. -(define_insn "usdot_prod" +(define_insn "aarch64_usdot" [(set (match_operand:VS 0 "register_operand" "=w") (plus:VS (unspec:VS [(match_operand: 2 "register_operand" "w") @@ -648,6 +648,17 @@ (define_expand "dot_prod" DONE; }) +;; Auto-vectorizer pattern for usdot. The operand[3] and operand[0] are the +;; RMW parameters that when it comes to the vectorizer. +(define_expand "usdot_prod" + [(set (match_operand:VS 0 "register_operand") + (plus:VS (unspec:VS [(match_operand: 1 "register_operand") + (match_operand: 2 "register_operand")] + UNSPEC_USDOT) + (match_operand:VS 3 "register_operand")))] + "TARGET_I8MM" +) + ;; These instructions map to the __builtins for the Dot Product ;; indexed operations. (define_insn "aarch64_dot_lane" diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 00d76ea937ace5763746478cbdfadf6479e0b15a..17e059efb80fa86a8a32127ace4fc7f43e2040a8 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -34039,14 +34039,14 @@ __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vusdot_s32 (int32x2_t __r, uint8x8_t __a, int8x8_t __b) { - return __builtin_aarch64_usdot_prodv8qi_ssus (__r, __a, __b); + return __builtin_aarch64_usdotv8qi_ssus (__r, __a, __b); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vusdotq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b) { - return __builtin_aarch64_usdot_prodv16qi_ssus (__r, __a, __b); + return __builtin_aarch64_usdotv16qi_ssus (__r, __a, __b); } __extension__ extern __inline int32x2_t --