From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 48) id F3FA8385840D; Sat, 2 Dec 2023 11:42:23 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org F3FA8385840D DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org; s=default; t=1701517344; bh=3T4SZm8+09XW77SUMXzYCG4q6O+AuCHkyPOt33HQGho=; h=From:To:Subject:Date:From; b=hv8eGOVRGeg8EAN5xsIvSi4LOVB+fK/bGQwzil97j80mKKoiDhL7UCXLgFhbh/nfe DIxGRlRmg1lj8hDp8AsOqpAfNo2pZU77uZiwzTReZJnNjvtlgWtM+pTKRs6cbtad7z yEhoD40V/epgc//LHsThTlqiVVhY//gkYETP3Jwg= From: "elrodc at gmail dot com" To: gcc-bugs@gcc.gnu.org Subject: [Bug tree-optimization/112824] New: Stack spills and vector splitting with vector builtins Date: Sat, 02 Dec 2023 11:42:22 +0000 X-Bugzilla-Reason: CC X-Bugzilla-Type: new X-Bugzilla-Watch-Reason: None X-Bugzilla-Product: gcc X-Bugzilla-Component: tree-optimization X-Bugzilla-Version: 14.0 X-Bugzilla-Keywords: X-Bugzilla-Severity: normal X-Bugzilla-Who: elrodc at gmail dot com X-Bugzilla-Status: UNCONFIRMED X-Bugzilla-Resolution: X-Bugzilla-Priority: P3 X-Bugzilla-Assigned-To: unassigned at gcc dot gnu.org X-Bugzilla-Target-Milestone: --- X-Bugzilla-Flags: X-Bugzilla-Changed-Fields: bug_id short_desc product version bug_status bug_severity priority component assigned_to reporter target_milestone Message-ID: Content-Type: text/plain; charset="UTF-8" Content-Transfer-Encoding: quoted-printable X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/ Auto-Submitted: auto-generated MIME-Version: 1.0 List-Id: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=3D112824 Bug ID: 112824 Summary: Stack spills and vector splitting with vector builtins Product: gcc Version: 14.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: tree-optimization Assignee: unassigned at gcc dot gnu.org Reporter: elrodc at gmail dot com Target Milestone: --- I am not sure which component to place this under, but selected tree-optimization as I suspect this is some sort of 
alias analysis failure preventing the removal of stack allocations. Godbolt link, reproduces on GCC trunk and 13.2: https://godbolt.org/z/4TPx17Mbn Clang has similar problems in my actual test case, but they don't show up in this minimal example I made. Although Clang isn't perfect here either: it fails to fuse fmadd + masked vmovapd, while GCC does succeed in fusing them. For reference, code behind the godbolt link is: #include #include #include #include template using Vec [[gnu::vector_size(W * sizeof(T))]] =3D T; // Omitted: 16 without AVX, 32 without AVX512F, // or for forward compatibility some AVX10 may also mean 32-only static constexpr ptrdiff_t VectorBytes =3D 64; template static constexpr ptrdiff_t VecWidth =3D 64 <=3D sizeof(T) ? 1 : 64/sizeof(T= ); template struct Vector{ static constexpr ptrdiff_t L =3D N; T data[L]; static constexpr auto size()->ptrdiff_t{return N;} }; template struct Vector{ static constexpr ptrdiff_t W =3D N >=3D VecWidth ? VecWidth : ptrdiff_t(std::bit_ceil(size_t(N)));=20 static constexpr ptrdiff_t L =3D (N/W) + ((N%W)!=3D0); using V =3D Vec; V data[L]; static constexpr auto size()->ptrdiff_t{return N;} }; /// should be trivially copyable /// codegen is worse when passing by value, even though it seems like it should make /// aliasing simpler to analyze?
template [[gnu::always_inline]] constexpr auto operator+(Vector x, Vector = y) -> Vector { Vector z; for (ptrdiff_t n =3D 0; n < Vector::L; ++n) z.data[n] =3D x.data[n= ] + y.data[n]; return z; } template [[gnu::always_inline]] constexpr auto operator*(Vector x, Vector = y) -> Vector { Vector z; for (ptrdiff_t n =3D 0; n < Vector::L; ++n) z.data[n] =3D x.data[n= ] * y.data[n]; return z; } template [[gnu::always_inline]] constexpr auto operator+(T x, Vector y) -> Vector { Vector z; for (ptrdiff_t n =3D 0; n < Vector::L; ++n) z.data[n] =3D x + y.da= ta[n]; return z; } template [[gnu::always_inline]] constexpr auto operator*(T x, Vector y) -> Vector { Vector z; for (ptrdiff_t n =3D 0; n < Vector::L; ++n) z.data[n] =3D x * y.da= ta[n]; return z; } template struct Dual { T value; Vector partials; }; // Here we have a specialization for non-power-of-2 `N` template =20 requires(std::floating_point && (std::popcount(size_t(N))>1)) struct Dual { Vector data; }; template consteval auto firstoff(){ static_assert(std::same_as, "type not implemented"); if constexpr (W=3D=3D2) return Vec<2,int64_t>{0,1} !=3D 0; else if constexpr (W =3D=3D 4) return Vec<4,int64_t>{0,1,2,3} !=3D 0; else if constexpr (W =3D=3D 8) return Vec<8,int64_t>{0,1,2,3,4,5,6,7} != =3D 0; else static_assert(false, "vector width not implemented"); } template [[gnu::always_inline]] constexpr auto operator+(Dual a, Dual b) -> Dual { if constexpr (std::floating_point && (std::popcount(size_t(N))>1)){ Dual c; for (ptrdiff_t l =3D 0; l < Vector::L; ++l) c.data.data[l] =3D a.data.data[l] + b.data.data[l];=20 return c; } else return {a.value + b.value, a.partials + b.partials}; } template [[gnu::always_inline]] constexpr auto operator*(Dual a, Dual b) -> Dual { if constexpr (std::floating_point && (std::popcount(size_t(N))>1)){ using V =3D typename Vector::V; V va =3D V{}+a.data.data[0][0], vb =3D V{}+b.data.data[0][0]; V x =3D va * b.data.data[0]; Dual c; c.data.data[0] =3D firstoff::W,T>() ? 
x + vb*a.data.data[0]= : x; for (ptrdiff_t l =3D 1; l < Vector::L; ++l) c.data.data[l] =3D va*b.data.data[l] + vb*a.data.data[l];=20 return c; } else return {a.value * b.value, a.value * b.partials + b.value * a.partials}; } void prod(Dual,2> &c, const Dual,2> &a, const Dual,2>&b){ c =3D a*b; } void prod(Dual,2> &c, const Dual,2> &a, const Dual,2>&b){ c =3D a*b; } GCC 13.2 asm, when compiling with -std=3Dgnu++23 -march=3Dskylake-avx512 -mprefer-vector-width=3D512 -O3 prod(Dual, 2l>&, Dual, 2l> const&, Dual, 2l> const&): push rbp mov eax, -2 kmovb k1, eax mov rbp, rsp and rsp, -64 sub rsp, 264 vmovdqa ymm4, YMMWORD PTR [rsi+128] vmovapd zmm8, ZMMWORD PTR [rsi] vmovapd zmm9, ZMMWORD PTR [rdx] vmovdqa ymm6, YMMWORD PTR [rsi+64] vmovdqa YMMWORD PTR [rsp+8], ymm4 vmovdqa ymm4, YMMWORD PTR [rdx+96] vbroadcastsd zmm0, xmm8 vmovdqa ymm7, YMMWORD PTR [rsi+96] vbroadcastsd zmm1, xmm9 vmovdqa YMMWORD PTR [rsp-56], ymm6 vmovdqa ymm5, YMMWORD PTR [rdx+128] vmovdqa ymm6, YMMWORD PTR [rsi+160] vmovdqa YMMWORD PTR [rsp+168], ymm4 vxorpd xmm4, xmm4, xmm4 vaddpd zmm0, zmm0, zmm4 vaddpd zmm1, zmm1, zmm4 vmovdqa YMMWORD PTR [rsp-24], ymm7 vmovdqa ymm7, YMMWORD PTR [rdx+64] vmovapd zmm3, ZMMWORD PTR [rsp-56] vmovdqa YMMWORD PTR [rsp+40], ymm6 vmovdqa ymm6, YMMWORD PTR [rdx+160] vmovdqa YMMWORD PTR [rsp+200], ymm5 vmulpd zmm2, zmm0, zmm9 vmovdqa YMMWORD PTR [rsp+136], ymm7 vmulpd zmm5, zmm1, zmm3 vbroadcastsd zmm3, xmm3 vmovdqa YMMWORD PTR [rsp+232], ymm6 vaddpd zmm3, zmm3, zmm4 vmovapd zmm7, zmm2 vmovapd zmm2, ZMMWORD PTR [rsp+8] vfmadd231pd zmm7{k1}, zmm8, zmm1 vmovapd zmm6, zmm5 vmovapd zmm5, ZMMWORD PTR [rsp+136] vmulpd zmm1, zmm1, zmm2 vfmadd231pd zmm6{k1}, zmm9, zmm3 vbroadcastsd zmm2, xmm2 vmovapd zmm3, ZMMWORD PTR [rsp+200] vaddpd zmm2, zmm2, zmm4 vmovapd ZMMWORD PTR [rdi], zmm7 vfmadd231pd zmm1{k1}, zmm9, zmm2 vmulpd zmm2, zmm0, zmm5 vbroadcastsd zmm5, xmm5 vmulpd zmm0, zmm0, zmm3 vbroadcastsd zmm3, xmm3 vaddpd zmm5, zmm5, zmm4 vaddpd zmm3, zmm3, zmm4 vfmadd231pd zmm2{k1}, 
zmm8, zmm5 vfmadd231pd zmm0{k1}, zmm8, zmm3 vaddpd zmm2, zmm2, zmm6 vaddpd zmm0, zmm0, zmm1 vmovapd ZMMWORD PTR [rdi+64], zmm2 vmovapd ZMMWORD PTR [rdi+128], zmm0 vzeroupper leave ret prod(Dual, 2l>&, Dual, 2l> const&, Dual, 2l> const&): push rbp mov rbp, rsp and rsp, -64 sub rsp, 648 vmovdqa ymm5, YMMWORD PTR [rsi+224] vmovdqa ymm3, YMMWORD PTR [rsi+352] vmovapd zmm0, ZMMWORD PTR [rdx+64] vmovdqa ymm2, YMMWORD PTR [rsi+320] vmovdqa YMMWORD PTR [rsp+104], ymm5 vmovdqa ymm5, YMMWORD PTR [rdx+224] vmovdqa ymm7, YMMWORD PTR [rsi+128] vmovdqa YMMWORD PTR [rsp+232], ymm3 vmovsd xmm3, QWORD PTR [rsi] vmovdqa ymm6, YMMWORD PTR [rsi+192] vmovdqa YMMWORD PTR [rsp+488], ymm5 vmovdqa ymm4, YMMWORD PTR [rdx+192] vmovapd zmm1, ZMMWORD PTR [rsi+64] vbroadcastsd zmm5, xmm3 vmovdqa YMMWORD PTR [rsp+200], ymm2 vmovdqa ymm2, YMMWORD PTR [rdx+320] vmulpd zmm8, zmm5, zmm0 vmovdqa YMMWORD PTR [rsp+8], ymm7 vmovdqa ymm7, YMMWORD PTR [rsi+256] vmovdqa YMMWORD PTR [rsp+72], ymm6 vmovdqa ymm6, YMMWORD PTR [rdx+128] vmovdqa YMMWORD PTR [rsp+584], ymm2 vmovsd xmm2, QWORD PTR [rdx] vmovdqa YMMWORD PTR [rsp+136], ymm7 vmovdqa ymm7, YMMWORD PTR [rdx+256] vmovdqa YMMWORD PTR [rsp+392], ymm6 vmovdqa ymm6, YMMWORD PTR [rdx+352] vmulsd xmm10, xmm3, xmm2 vmovdqa YMMWORD PTR [rsp+456], ymm4 vbroadcastsd zmm4, xmm2 vfmadd231pd zmm8, zmm4, zmm1 vmovdqa YMMWORD PTR [rsp+520], ymm7 vmovdqa YMMWORD PTR [rsp+616], ymm6 vmulpd zmm9, zmm4, ZMMWORD PTR [rsp+72] vmovsd xmm6, QWORD PTR [rsp+520] vmulpd zmm4, zmm4, ZMMWORD PTR [rsp+200] vmulpd zmm11, zmm5, ZMMWORD PTR [rsp+456] vmovsd QWORD PTR [rdi], xmm10 vmulpd zmm5, zmm5, ZMMWORD PTR [rsp+584] vmovapd ZMMWORD PTR [rdi+64], zmm8 vfmadd231pd zmm9, zmm0, QWORD PTR [rsp+8]{1to8} vfmadd231pd zmm4, zmm0, QWORD PTR [rsp+136]{1to8} vmovsd xmm0, QWORD PTR [rsp+392] vmulsd xmm7, xmm3, xmm0 vbroadcastsd zmm0, xmm0 vmulsd xmm3, xmm3, xmm6 vfmadd132pd zmm0, zmm11, zmm1 vbroadcastsd zmm6, xmm6 vfmadd132pd zmm1, zmm5, zmm6 vfmadd231sd xmm7, xmm2, QWORD PTR [rsp+8] 
vfmadd132sd xmm2, xmm3, QWORD PTR [rsp+136] vaddpd zmm0, zmm0, zmm9 vaddpd zmm1, zmm1, zmm4 vmovapd ZMMWORD PTR [rdi+192], zmm0 vmovsd QWORD PTR [rdi+128], xmm7 vmovsd QWORD PTR [rdi+256], xmm2 vmovapd ZMMWORD PTR [rdi+320], zmm1 vzeroupper leave ret Note all the stores to/loads from rsp, and the use of ymm registers.