Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. Ok for master? Thanks, Tamar gcc/ChangeLog: * config/aarch64/aarch64-simd-builtins.def (sdot, udot): Rename to.. (sdot_prod, udot_prod): ... This. * config/aarch64/aarch64-simd.md (aarch64_dot): Merged into... (dot_prod): ... this. (aarch64_dot_lane, aarch64_dot_laneq): Change operands order. (sadv16qi): Use new operands order. * config/aarch64/arm_neon.h (vdot_u32, vdotq_u32, vdot_s32, vdotq_s32): Use new RTL ordering. --- inline copy of patch --- diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index 3bb45a82945b143497035ec30d35543b2dad55a3..402453aa9bba5949da43c984c4603196b1efd092 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -375,8 +375,8 @@ BUILTIN_VSDQ_I_DI (BINOP_UUS, urshl, 0, NONE) /* Implemented by _prod. */ - BUILTIN_VB (TERNOP, sdot, 0, NONE) - BUILTIN_VB (TERNOPU, udot, 0, NONE) + BUILTIN_VB (TERNOP, sdot_prod, 10, NONE) + BUILTIN_VB (TERNOPU, udot_prod, 10, NONE) BUILTIN_VB (TERNOP_SUSS, usdot_prod, 10, NONE) /* Implemented by aarch64__lane{q}. */ BUILTIN_VB (QUADOP_LANE, sdot_lane, 0, NONE) diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index bf667b99944e3fcce618a21c77bd5b804b3a0b5d..13c86984df147f2033b81a2a5278252f5ac52779 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -587,19 +587,8 @@ (define_expand "cmul3" DONE; }) -;; These instructions map to the __builtins for the Dot Product operations. -(define_insn "aarch64_dot" - [(set (match_operand:VS 0 "register_operand" "=w") - (plus:VS (match_operand:VS 1 "register_operand" "0") - (unspec:VS [(match_operand: 2 "register_operand" "w") - (match_operand: 3 "register_operand" "w")] - DOTPROD)))] - "TARGET_DOTPROD" - "dot\\t%0., %2., %3." - [(set_attr "type" "neon_dot")] -) - -;; These expands map to the Dot Product optab the vectorizer checks for. +;; These expands map to the Dot Product optab the vectorizer checks for +;; and to the intrinsics patttern. ;; The auto-vectorizer expects a dot product builtin that also does an ;; accumulation into the provided register. ;; Given the following pattern @@ -619,20 +608,17 @@ (define_insn "aarch64_dot" ;; ... ;; ;; and so the vectorizer provides r, in which the result has to be accumulated. -(define_expand "dot_prod" - [(set (match_operand:VS 0 "register_operand") - (plus:VS (unspec:VS [(match_operand: 1 "register_operand") - (match_operand: 2 "register_operand")] - DOTPROD) - (match_operand:VS 3 "register_operand")))] +(define_insn "dot_prod" + [(set (match_operand:VS 0 "register_operand" "=w") + (plus:VS + (unspec:VS [(match_operand: 1 "register_operand" "w") + (match_operand: 2 "register_operand" "w")] + DOTPROD) + (match_operand:VS 3 "register_operand" "0")))] "TARGET_DOTPROD" -{ - emit_insn ( - gen_aarch64_dot (operands[3], operands[3], operands[1], - operands[2])); - emit_insn (gen_rtx_SET (operands[0], operands[3])); - DONE; -}) + "dot\\t%0., %1., %2." + [(set_attr "type" "neon_dot")] +) ;; These instructions map to the __builtins for the Armv8.6-a I8MM usdot ;; (vector) Dot Product operation and the vectorized optab. @@ -652,11 +638,12 @@ (define_insn "usdot_prod" ;; indexed operations. (define_insn "aarch64_dot_lane" [(set (match_operand:VS 0 "register_operand" "=w") - (plus:VS (match_operand:VS 1 "register_operand" "0") - (unspec:VS [(match_operand: 2 "register_operand" "w") - (match_operand:V8QI 3 "register_operand" "") - (match_operand:SI 4 "immediate_operand" "i")] - DOTPROD)))] + (plus:VS + (unspec:VS [(match_operand: 2 "register_operand" "w") + (match_operand:V8QI 3 "register_operand" "") + (match_operand:SI 4 "immediate_operand" "i")] + DOTPROD) + (match_operand:VS 1 "register_operand" "0")))] "TARGET_DOTPROD" { operands[4] = aarch64_endian_lane_rtx (V8QImode, INTVAL (operands[4])); @@ -667,11 +654,12 @@ (define_insn "aarch64_dot_lane" (define_insn "aarch64_dot_laneq" [(set (match_operand:VS 0 "register_operand" "=w") - (plus:VS (match_operand:VS 1 "register_operand" "0") - (unspec:VS [(match_operand: 2 "register_operand" "w") - (match_operand:V16QI 3 "register_operand" "") - (match_operand:SI 4 "immediate_operand" "i")] - DOTPROD)))] + (plus:VS + (unspec:VS [(match_operand: 2 "register_operand" "w") + (match_operand:V16QI 3 "register_operand" "") + (match_operand:SI 4 "immediate_operand" "i")] + DOTPROD) + (match_operand:VS 1 "register_operand" "0")))] "TARGET_DOTPROD" { operands[4] = aarch64_endian_lane_rtx (V16QImode, INTVAL (operands[4])); @@ -944,8 +932,7 @@ (define_expand "sadv16qi" rtx ones = force_reg (V16QImode, CONST1_RTX (V16QImode)); rtx abd = gen_reg_rtx (V16QImode); emit_insn (gen_aarch64_abdv16qi (abd, operands[1], operands[2])); - emit_insn (gen_aarch64_udotv16qi (operands[0], operands[3], - abd, ones)); + emit_insn (gen_udot_prodv16qi (operands[0], abd, ones, operands[3])); DONE; } rtx reduc = gen_reg_rtx (V8HImode); diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 8396e872580bc9fb32b872f3915485b02ec2b334..08bede79ad252b3728fdb278036a4de73696a5db 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -31749,28 +31749,28 @@ __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdot_u32 (uint32x2_t __r, uint8x8_t __a, uint8x8_t __b) { - return __builtin_aarch64_udotv8qi_uuuu (__r, __a, __b); + return __builtin_aarch64_udot_prodv8qi_uuuu (__a, __b, __r); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdotq_u32 (uint32x4_t __r, uint8x16_t __a, uint8x16_t __b) { - return __builtin_aarch64_udotv16qi_uuuu (__r, __a, __b); + return __builtin_aarch64_udot_prodv16qi_uuuu (__a, __b, __r); } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdot_s32 (int32x2_t __r, int8x8_t __a, int8x8_t __b) { - return __builtin_aarch64_sdotv8qi (__r, __a, __b); + return __builtin_aarch64_sdot_prodv8qi (__a, __b, __r); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdotq_s32 (int32x4_t __r, int8x16_t __a, int8x16_t __b) { - return __builtin_aarch64_sdotv16qi (__r, __a, __b); + return __builtin_aarch64_sdot_prodv16qi (__a, __b, __r); } __extension__ extern __inline uint32x2_t > -----Original Message----- > From: Richard Sandiford > Sent: Thursday, July 15, 2021 8:45 PM > To: Tamar Christina > Cc: gcc-patches@gcc.gnu.org; nd ; Richard Earnshaw > ; Marcus Shawcroft > ; Kyrylo Tkachov > Subject: Re: [PATCH 3/4]AArch64: correct dot-product RTL patterns for > aarch64. > > Tamar Christina writes: > > Hi All, > > > > The previous fix for this problem was wrong due to a subtle difference > > between where NEON expects the RMW values and where intrinsics > expects them. > > > > The insn pattern is modeled after the intrinsics and so needs an > > expand for the vectorizer optab to switch the RTL. > > > > However operand[3] is not expected to be written to so the current > > pattern is bogus. > > > > Instead we use the expand to shuffle around the RTL. > > > > The vectorizer expects operands[3] and operands[0] to be the same but > > the aarch64 intrinsics expanders expect operands[0] and operands[1] to > > be the same. > > > > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. > > > > Ok for master? and active branches after some stew? > > > > Thanks, > > Tamar > > > > gcc/ChangeLog: > > > > * config/aarch64/aarch64-simd.md (dot_prod): Correct > > RTL. > > > > --- inline copy of patch -- > > diff --git a/gcc/config/aarch64/aarch64-simd.md > > b/gcc/config/aarch64/aarch64-simd.md > > index > > > 7397f1ec5ca0cb9e3cdd5c46772f604e640666e4..51789f954affd9fa88e2bc1bcc3 > d > > acf64ccb5bde 100644 > > --- a/gcc/config/aarch64/aarch64-simd.md > > +++ b/gcc/config/aarch64/aarch64-simd.md > > @@ -635,18 +635,12 @@ (define_insn "aarch64_usdot" > > ;; and so the vectorizer provides r, in which the result has to be > accumulated. > > (define_expand "dot_prod" > > [(set (match_operand:VS 0 "register_operand") > > - (plus:VS (unspec:VS [(match_operand: 1 > "register_operand") > > + (plus:VS (match_operand:VS 3 "register_operand") > > + (unspec:VS [(match_operand: 1 > "register_operand") > > (match_operand: 2 "register_operand")] > > - DOTPROD) > > - (match_operand:VS 3 "register_operand")))] > > + DOTPROD)))] > > "TARGET_DOTPROD" > > The canonical plus: operand order was the original one, so I think it would be > better to keep this rtl as-is and instead change aarch64_dot to: > > (plus:VS (unspec:VS [(match_operand: 2 > "register_operand" "w") > (match_operand: 3 "register_operand" > "w")] > DOTPROD) > (match_operand:VS 1 "register_operand" "0")) > > Same idea for aarch64_dot_lane and > aarch64_dot_laneq. > > Sorry to be awkward… > > Thanks, > Richard > > > -{ > > - emit_insn ( > > - gen_aarch64_dot (operands[3], operands[3], operands[1], > > - operands[2])); > > - emit_insn (gen_rtx_SET (operands[0], operands[3])); > > - DONE; > > -}) > > +) > > > > ;; Auto-vectorizer pattern for usdot. The operand[3] and operand[0] > > are the ;; RMW parameters that when it comes to the vectorizer.