public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug tree-optimization/112824] New: Stack spills and vector splitting with vector builtins
@ 2023-12-02 11:42 elrodc at gmail dot com
2023-12-02 13:20 ` [Bug tree-optimization/112824] " elrodc at gmail dot com
` (9 more replies)
0 siblings, 10 replies; 11+ messages in thread
From: elrodc at gmail dot com @ 2023-12-02 11:42 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112824
Bug ID: 112824
Summary: Stack spills and vector splitting with vector builtins
Product: gcc
Version: 14.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: tree-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: elrodc at gmail dot com
Target Milestone: ---
I am not sure which component to place this under, but selected
tree-optimization as I suspect this is some sort of alias analysis failure
preventing the removal of stack allocations.
Godbolt link, reproduces on GCC trunk and 13.2:
https://godbolt.org/z/4TPx17Mbn
Clang has similar problems in my actual test case, but they don't show up in
this minimal example I made. Although Clang isn't perfect here either: it fails
to fuse fmadd + masked vmovapd, while GCC does succeed in fusing them.
For reference, code behind the godbolt link is:
#include <bit>
#include <concepts>
#include <cstddef>
#include <cstdint>
template <ptrdiff_t W, typename T>
using Vec [[gnu::vector_size(W * sizeof(T))]] = T;
// Omitted: 16 without AVX, 32 without AVX512F,
// or for forward compatibility some AVX10 may also mean 32-only
static constexpr ptrdiff_t VectorBytes = 64;
template<typename T>
static constexpr ptrdiff_t VecWidth = 64 <= sizeof(T) ? 1 : 64/sizeof(T);
template <typename T, ptrdiff_t N> struct Vector{
static constexpr ptrdiff_t L = N;
T data[L];
static constexpr auto size()->ptrdiff_t{return N;}
};
template <std::floating_point T, ptrdiff_t N> struct Vector<T,N>{
static constexpr ptrdiff_t W = N >= VecWidth<T> ? VecWidth<T> :
ptrdiff_t(std::bit_ceil(size_t(N)));
static constexpr ptrdiff_t L = (N/W) + ((N%W)!=0);
using V = Vec<W,T>;
V data[L];
static constexpr auto size()->ptrdiff_t{return N;}
};
/// should be trivially copyable
/// codegen is worse when passing by value, even though it seems like it
/// should make aliasing simpler to analyze?
template<typename T, ptrdiff_t N>
[[gnu::always_inline]] constexpr auto operator+(Vector<T,N> x, Vector<T,N> y)
-> Vector<T,N> {
Vector<T,N> z;
for (ptrdiff_t n = 0; n < Vector<T,N>::L; ++n) z.data[n] = x.data[n] +
y.data[n];
return z;
}
template<typename T, ptrdiff_t N>
[[gnu::always_inline]] constexpr auto operator*(Vector<T,N> x, Vector<T,N> y)
-> Vector<T,N> {
Vector<T,N> z;
for (ptrdiff_t n = 0; n < Vector<T,N>::L; ++n) z.data[n] = x.data[n] *
y.data[n];
return z;
}
template<typename T, ptrdiff_t N>
[[gnu::always_inline]] constexpr auto operator+(T x, Vector<T,N> y) ->
Vector<T,N> {
Vector<T,N> z;
for (ptrdiff_t n = 0; n < Vector<T,N>::L; ++n) z.data[n] = x + y.data[n];
return z;
}
template<typename T, ptrdiff_t N>
[[gnu::always_inline]] constexpr auto operator*(T x, Vector<T,N> y) ->
Vector<T,N> {
Vector<T,N> z;
for (ptrdiff_t n = 0; n < Vector<T,N>::L; ++n) z.data[n] = x * y.data[n];
return z;
}
template <typename T, ptrdiff_t N> struct Dual {
T value;
Vector<T, N> partials;
};
// Here we have a specialization for non-power-of-2 `N`
template <typename T, ptrdiff_t N>
requires(std::floating_point<T> && (std::popcount(size_t(N))>1))
struct Dual<T,N> {
Vector<T, N+1> data;
};
template<ptrdiff_t W, typename T>
consteval auto firstoff(){
static_assert(std::same_as<T,double>, "type not implemented");
if constexpr (W==2) return Vec<2,int64_t>{0,1} != 0;
else if constexpr (W == 4) return Vec<4,int64_t>{0,1,2,3} != 0;
else if constexpr (W == 8) return Vec<8,int64_t>{0,1,2,3,4,5,6,7} != 0;
else static_assert(false, "vector width not implemented");
}
template <typename T, ptrdiff_t N>
[[gnu::always_inline]] constexpr auto operator+(Dual<T, N> a,
Dual<T, N> b)
-> Dual<T, N> {
if constexpr (std::floating_point<T> && (std::popcount(size_t(N))>1)){
Dual<T,N> c;
for (ptrdiff_t l = 0; l < Vector<T,N>::L; ++l)
c.data.data[l] = a.data.data[l] + b.data.data[l];
return c;
} else return {a.value + b.value, a.partials + b.partials};
}
template <typename T, ptrdiff_t N>
[[gnu::always_inline]] constexpr auto operator*(Dual<T, N> a,
Dual<T, N> b)
-> Dual<T, N> {
if constexpr (std::floating_point<T> && (std::popcount(size_t(N))>1)){
using V = typename Vector<T,N>::V;
V va = V{}+a.data.data[0][0], vb = V{}+b.data.data[0][0];
V x = va * b.data.data[0];
Dual<T,N> c;
c.data.data[0] = firstoff<Vector<T,N>::W,T>() ? x + vb*a.data.data[0] : x;
for (ptrdiff_t l = 1; l < Vector<T,N>::L; ++l)
c.data.data[l] = va*b.data.data[l] + vb*a.data.data[l];
return c;
} else return {a.value * b.value, a.value * b.partials + b.value *
a.partials};
}
void prod(Dual<Dual<double,7>,2> &c, const Dual<Dual<double,7>,2> &a, const
Dual<Dual<double,7>,2>&b){
c = a*b;
}
void prod(Dual<Dual<double,8>,2> &c, const Dual<Dual<double,8>,2> &a, const
Dual<Dual<double,8>,2>&b){
c = a*b;
}
GCC 13.2 asm, when compiling with
-std=gnu++23 -march=skylake-avx512 -mprefer-vector-width=512 -O3
prod(Dual<Dual<double, 7l>, 2l>&, Dual<Dual<double, 7l>, 2l> const&,
Dual<Dual<double, 7l>, 2l> const&):
push rbp
mov eax, -2
kmovb k1, eax
mov rbp, rsp
and rsp, -64
sub rsp, 264
vmovdqa ymm4, YMMWORD PTR [rsi+128]
vmovapd zmm8, ZMMWORD PTR [rsi]
vmovapd zmm9, ZMMWORD PTR [rdx]
vmovdqa ymm6, YMMWORD PTR [rsi+64]
vmovdqa YMMWORD PTR [rsp+8], ymm4
vmovdqa ymm4, YMMWORD PTR [rdx+96]
vbroadcastsd zmm0, xmm8
vmovdqa ymm7, YMMWORD PTR [rsi+96]
vbroadcastsd zmm1, xmm9
vmovdqa YMMWORD PTR [rsp-56], ymm6
vmovdqa ymm5, YMMWORD PTR [rdx+128]
vmovdqa ymm6, YMMWORD PTR [rsi+160]
vmovdqa YMMWORD PTR [rsp+168], ymm4
vxorpd xmm4, xmm4, xmm4
vaddpd zmm0, zmm0, zmm4
vaddpd zmm1, zmm1, zmm4
vmovdqa YMMWORD PTR [rsp-24], ymm7
vmovdqa ymm7, YMMWORD PTR [rdx+64]
vmovapd zmm3, ZMMWORD PTR [rsp-56]
vmovdqa YMMWORD PTR [rsp+40], ymm6
vmovdqa ymm6, YMMWORD PTR [rdx+160]
vmovdqa YMMWORD PTR [rsp+200], ymm5
vmulpd zmm2, zmm0, zmm9
vmovdqa YMMWORD PTR [rsp+136], ymm7
vmulpd zmm5, zmm1, zmm3
vbroadcastsd zmm3, xmm3
vmovdqa YMMWORD PTR [rsp+232], ymm6
vaddpd zmm3, zmm3, zmm4
vmovapd zmm7, zmm2
vmovapd zmm2, ZMMWORD PTR [rsp+8]
vfmadd231pd zmm7{k1}, zmm8, zmm1
vmovapd zmm6, zmm5
vmovapd zmm5, ZMMWORD PTR [rsp+136]
vmulpd zmm1, zmm1, zmm2
vfmadd231pd zmm6{k1}, zmm9, zmm3
vbroadcastsd zmm2, xmm2
vmovapd zmm3, ZMMWORD PTR [rsp+200]
vaddpd zmm2, zmm2, zmm4
vmovapd ZMMWORD PTR [rdi], zmm7
vfmadd231pd zmm1{k1}, zmm9, zmm2
vmulpd zmm2, zmm0, zmm5
vbroadcastsd zmm5, xmm5
vmulpd zmm0, zmm0, zmm3
vbroadcastsd zmm3, xmm3
vaddpd zmm5, zmm5, zmm4
vaddpd zmm3, zmm3, zmm4
vfmadd231pd zmm2{k1}, zmm8, zmm5
vfmadd231pd zmm0{k1}, zmm8, zmm3
vaddpd zmm2, zmm2, zmm6
vaddpd zmm0, zmm0, zmm1
vmovapd ZMMWORD PTR [rdi+64], zmm2
vmovapd ZMMWORD PTR [rdi+128], zmm0
vzeroupper
leave
ret
prod(Dual<Dual<double, 8l>, 2l>&, Dual<Dual<double, 8l>, 2l> const&,
Dual<Dual<double, 8l>, 2l> const&):
push rbp
mov rbp, rsp
and rsp, -64
sub rsp, 648
vmovdqa ymm5, YMMWORD PTR [rsi+224]
vmovdqa ymm3, YMMWORD PTR [rsi+352]
vmovapd zmm0, ZMMWORD PTR [rdx+64]
vmovdqa ymm2, YMMWORD PTR [rsi+320]
vmovdqa YMMWORD PTR [rsp+104], ymm5
vmovdqa ymm5, YMMWORD PTR [rdx+224]
vmovdqa ymm7, YMMWORD PTR [rsi+128]
vmovdqa YMMWORD PTR [rsp+232], ymm3
vmovsd xmm3, QWORD PTR [rsi]
vmovdqa ymm6, YMMWORD PTR [rsi+192]
vmovdqa YMMWORD PTR [rsp+488], ymm5
vmovdqa ymm4, YMMWORD PTR [rdx+192]
vmovapd zmm1, ZMMWORD PTR [rsi+64]
vbroadcastsd zmm5, xmm3
vmovdqa YMMWORD PTR [rsp+200], ymm2
vmovdqa ymm2, YMMWORD PTR [rdx+320]
vmulpd zmm8, zmm5, zmm0
vmovdqa YMMWORD PTR [rsp+8], ymm7
vmovdqa ymm7, YMMWORD PTR [rsi+256]
vmovdqa YMMWORD PTR [rsp+72], ymm6
vmovdqa ymm6, YMMWORD PTR [rdx+128]
vmovdqa YMMWORD PTR [rsp+584], ymm2
vmovsd xmm2, QWORD PTR [rdx]
vmovdqa YMMWORD PTR [rsp+136], ymm7
vmovdqa ymm7, YMMWORD PTR [rdx+256]
vmovdqa YMMWORD PTR [rsp+392], ymm6
vmovdqa ymm6, YMMWORD PTR [rdx+352]
vmulsd xmm10, xmm3, xmm2
vmovdqa YMMWORD PTR [rsp+456], ymm4
vbroadcastsd zmm4, xmm2
vfmadd231pd zmm8, zmm4, zmm1
vmovdqa YMMWORD PTR [rsp+520], ymm7
vmovdqa YMMWORD PTR [rsp+616], ymm6
vmulpd zmm9, zmm4, ZMMWORD PTR [rsp+72]
vmovsd xmm6, QWORD PTR [rsp+520]
vmulpd zmm4, zmm4, ZMMWORD PTR [rsp+200]
vmulpd zmm11, zmm5, ZMMWORD PTR [rsp+456]
vmovsd QWORD PTR [rdi], xmm10
vmulpd zmm5, zmm5, ZMMWORD PTR [rsp+584]
vmovapd ZMMWORD PTR [rdi+64], zmm8
vfmadd231pd zmm9, zmm0, QWORD PTR [rsp+8]{1to8}
vfmadd231pd zmm4, zmm0, QWORD PTR [rsp+136]{1to8}
vmovsd xmm0, QWORD PTR [rsp+392]
vmulsd xmm7, xmm3, xmm0
vbroadcastsd zmm0, xmm0
vmulsd xmm3, xmm3, xmm6
vfmadd132pd zmm0, zmm11, zmm1
vbroadcastsd zmm6, xmm6
vfmadd132pd zmm1, zmm5, zmm6
vfmadd231sd xmm7, xmm2, QWORD PTR [rsp+8]
vfmadd132sd xmm2, xmm3, QWORD PTR [rsp+136]
vaddpd zmm0, zmm0, zmm9
vaddpd zmm1, zmm1, zmm4
vmovapd ZMMWORD PTR [rdi+192], zmm0
vmovsd QWORD PTR [rdi+128], xmm7
vmovsd QWORD PTR [rdi+256], xmm2
vmovapd ZMMWORD PTR [rdi+320], zmm1
vzeroupper
leave
ret
Note all the stores to/loads from rsp, and the use of ymm registers.
^ permalink raw reply [flat|nested] 11+ messages in thread
* [Bug tree-optimization/112824] Stack spills and vector splitting with vector builtins
2023-12-02 11:42 [Bug tree-optimization/112824] New: Stack spills and vector splitting with vector builtins elrodc at gmail dot com
@ 2023-12-02 13:20 ` elrodc at gmail dot com
2023-12-03 13:29 ` [Bug middle-end/112824] " elrodc at gmail dot com
` (8 subsequent siblings)
9 siblings, 0 replies; 11+ messages in thread
From: elrodc at gmail dot com @ 2023-12-02 13:20 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112824
--- Comment #1 from Chris Elrod <elrodc at gmail dot com> ---
Here I have added a godbolt example where I manually unroll the array, where
GCC generates excellent code https://godbolt.org/z/sd4bhGW7e
I'm not sure it is 100% optimal, but with an inner Dual size of `7`, on
Skylake-X it is 38 uops for unrolled GCC with separate struct fields, vs 49
uops for Clang, vs 67 for GCC with arrays.
uica expects <14 clock cycles for the manually unrolled vs >23 for the array
version.
My experience so far with expression templates has borne this out: compilers
seem to struggle with peeling away abstractions.
^ permalink raw reply [flat|nested] 11+ messages in thread
* [Bug middle-end/112824] Stack spills and vector splitting with vector builtins
2023-12-02 11:42 [Bug tree-optimization/112824] New: Stack spills and vector splitting with vector builtins elrodc at gmail dot com
2023-12-02 13:20 ` [Bug tree-optimization/112824] " elrodc at gmail dot com
@ 2023-12-03 13:29 ` elrodc at gmail dot com
2023-12-03 13:58 ` elrodc at gmail dot com
` (7 subsequent siblings)
9 siblings, 0 replies; 11+ messages in thread
From: elrodc at gmail dot com @ 2023-12-03 13:29 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112824
--- Comment #2 from Chris Elrod <elrodc at gmail dot com> ---
https://godbolt.org/z/3648aMTz8
Perhaps a simpler diff is that you can reproduce by commenting out the pragma,
but codegen becomes good with it.
template<typename T, ptrdiff_t N>
constexpr auto operator*(OuterDualUA2<T,N> a, OuterDualUA2<T,N>
b)->OuterDualUA2<T,N>{
//return
{a.value*b.value,a.value*b.p[0]+b.value*a.p[0],a.value*b.p[1]+b.value*a.p[1]};
OuterDualUA2<T,N> c;
c.value = a.value*b.value;
#pragma GCC unroll 16
for (ptrdiff_t i = 0; i < 2; ++i)
c.p[i] = a.value*b.p[i] + b.value*a.p[i];
//c.p[0] = a.value*b.p[0] + b.value*a.p[0];
//c.p[1] = a.value*b.p[1] + b.value*a.p[1];
return c;
}
It's not great to have to add pragmas everywhere to my actual codebase. I
thought I hit the important cases, but my non-minimal example still gets
unnecessary register splits and stack spills, so maybe I missed places, or
perhaps there's another issue.
Given that GCC unrolls the above code even without the pragma, it seems like a
definite bug that the pragma is needed for the resulting code generation to
actually be good.
Not knowing the compiler pipeline, my naive guess is that the pragma causes
earlier unrolling than whatever optimization pass does it sans pragma, and that
some important analysis/optimization gets run between those two times.
^ permalink raw reply [flat|nested] 11+ messages in thread
* [Bug middle-end/112824] Stack spills and vector splitting with vector builtins
2023-12-02 11:42 [Bug tree-optimization/112824] New: Stack spills and vector splitting with vector builtins elrodc at gmail dot com
2023-12-02 13:20 ` [Bug tree-optimization/112824] " elrodc at gmail dot com
2023-12-03 13:29 ` [Bug middle-end/112824] " elrodc at gmail dot com
@ 2023-12-03 13:58 ` elrodc at gmail dot com
2023-12-04 3:42 ` liuhongt at gcc dot gnu.org
` (6 subsequent siblings)
9 siblings, 0 replies; 11+ messages in thread
From: elrodc at gmail dot com @ 2023-12-03 13:58 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112824
--- Comment #3 from Chris Elrod <elrodc at gmail dot com> ---
> I thought I hit the important cases, but my non-minimal example still gets unnecessary register splits and stack spills, so maybe I missed places, or perhaps there's another issue.
Adding the unroll pragma to the `Vector`'s operator + and *:
template<typename T, ptrdiff_t N>
[[gnu::always_inline]] constexpr auto operator+(Vector<T,N> x, Vector<T,N> y)
-> Vector<T,N> {
Vector<T,N> z;
#pragma GCC unroll 16
for (ptrdiff_t n = 0; n < Vector<T,N>::L; ++n) z.data[n] = x.data[n] +
y.data[n];
return z;
}
template<typename T, ptrdiff_t N>
[[gnu::always_inline]] constexpr auto operator*(Vector<T,N> x, Vector<T,N> y)
-> Vector<T,N> {
Vector<T,N> z;
#pragma GCC unroll 16
for (ptrdiff_t n = 0; n < Vector<T,N>::L; ++n) z.data[n] = x.data[n] *
y.data[n];
return z;
}
template<typename T, ptrdiff_t N>
[[gnu::always_inline]] constexpr auto operator+(T x, Vector<T,N> y) ->
Vector<T,N> {
Vector<T,N> z;
#pragma GCC unroll 16
for (ptrdiff_t n = 0; n < Vector<T,N>::L; ++n) z.data[n] = x + y.data[n];
return z;
}
template<typename T, ptrdiff_t N>
[[gnu::always_inline]] constexpr auto operator*(T x, Vector<T,N> y) ->
Vector<T,N> {
Vector<T,N> z;
#pragma GCC unroll 16
for (ptrdiff_t n = 0; n < Vector<T,N>::L; ++n) z.data[n] = x * y.data[n];
return z;
}
does not improve code generation (still get the same problem), so that's a
reproducer for such an issue.
^ permalink raw reply [flat|nested] 11+ messages in thread
* [Bug middle-end/112824] Stack spills and vector splitting with vector builtins
2023-12-02 11:42 [Bug tree-optimization/112824] New: Stack spills and vector splitting with vector builtins elrodc at gmail dot com
` (2 preceding siblings ...)
2023-12-03 13:58 ` elrodc at gmail dot com
@ 2023-12-04 3:42 ` liuhongt at gcc dot gnu.org
2023-12-04 10:04 ` rguenth at gcc dot gnu.org
` (5 subsequent siblings)
9 siblings, 0 replies; 11+ messages in thread
From: liuhongt at gcc dot gnu.org @ 2023-12-04 3:42 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112824
liuhongt at gcc dot gnu.org changed:
What |Removed |Added
----------------------------------------------------------------------------
CC| |liuhongt at gcc dot gnu.org
--- Comment #4 from liuhongt at gcc dot gnu.org ---
there're 2 reasons.
1. X86_TUNE_AVX512_MOVE_BY_PIECES is used for move struct but it's independent
of -mprefer-vector-width=512; manually adding
-mtune-ctrl=avx512_move_by_pieces can solve most cases.
2. There's still spills for (subreg:DF (reg: V8DF) since ix86_modes_tieable_p
return false for DF and V8DF.
For 1, I guess when there's explicit -mprefer-vector-width=512, it should
rewrite tune X86_TUNE_AVX512_MOVE_BY_PIECES and unset
X86_TUNE_AVX256_OPTIMAL/X86_TUNE_AVX128_OPTIMAL
For 2, according to doc, it should return false since DF can be allocated to
X87 reg, but V8DF only SSE_REGS, so the fix should be using vec_select instead
of subreg for that case?
^ permalink raw reply [flat|nested] 11+ messages in thread
* [Bug middle-end/112824] Stack spills and vector splitting with vector builtins
2023-12-02 11:42 [Bug tree-optimization/112824] New: Stack spills and vector splitting with vector builtins elrodc at gmail dot com
` (3 preceding siblings ...)
2023-12-04 3:42 ` liuhongt at gcc dot gnu.org
@ 2023-12-04 10:04 ` rguenth at gcc dot gnu.org
2023-12-04 16:43 ` elrodc at gmail dot com
` (4 subsequent siblings)
9 siblings, 0 replies; 11+ messages in thread
From: rguenth at gcc dot gnu.org @ 2023-12-04 10:04 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112824
Richard Biener <rguenth at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Ever confirmed|0 |1
Last reconfirmed| |2023-12-04
Status|UNCONFIRMED |NEW
--- Comment #5 from Richard Biener <rguenth at gcc dot gnu.org> ---
On GIMPLE there's nothing wrong I think. There's some inconsistency in
representing a splat where when we have a scalar source we use a CONSTRUCTOR
but when we have a (multi-use) vector we use a VEC_PERM to splat its
first element (but I guess that's actually good in the end).
^ permalink raw reply [flat|nested] 11+ messages in thread
* [Bug middle-end/112824] Stack spills and vector splitting with vector builtins
2023-12-02 11:42 [Bug tree-optimization/112824] New: Stack spills and vector splitting with vector builtins elrodc at gmail dot com
` (4 preceding siblings ...)
2023-12-04 10:04 ` rguenth at gcc dot gnu.org
@ 2023-12-04 16:43 ` elrodc at gmail dot com
2023-12-05 3:44 ` liuhongt at gcc dot gnu.org
` (3 subsequent siblings)
9 siblings, 0 replies; 11+ messages in thread
From: elrodc at gmail dot com @ 2023-12-04 16:43 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112824
--- Comment #6 from Chris Elrod <elrodc at gmail dot com> ---
Hongtao Liu, I do think that one should ideally be able to get optimal codegen
when using 512-bit builtin vectors or vector intrinsics, without needing to set
`-mprefer-vector-width=512` (and, currently, also setting
`-mtune-ctrl=avx512_move_by_pieces`).
For example, if I remove `-mprefer-vector-width=512`, I get
prod(Dual<Dual<double, 7l>, 2l>&, Dual<Dual<double, 7l>, 2l> const&,
Dual<Dual<double, 7l>, 2l> const&):
push rbp
mov eax, -2
kmovb k1, eax
mov rbp, rsp
and rsp, -64
sub rsp, 264
vmovdqa ymm4, YMMWORD PTR [rsi+128]
vmovapd zmm8, ZMMWORD PTR [rsi]
vmovapd zmm9, ZMMWORD PTR [rdx]
vmovdqa ymm6, YMMWORD PTR [rsi+64]
vmovdqa YMMWORD PTR [rsp+8], ymm4
vmovdqa ymm4, YMMWORD PTR [rdx+96]
vbroadcastsd zmm0, xmm8
vmovdqa ymm7, YMMWORD PTR [rsi+96]
vbroadcastsd zmm1, xmm9
vmovdqa YMMWORD PTR [rsp-56], ymm6
vmovdqa ymm5, YMMWORD PTR [rdx+128]
vmovdqa ymm6, YMMWORD PTR [rsi+160]
vmovdqa YMMWORD PTR [rsp+168], ymm4
vxorpd xmm4, xmm4, xmm4
vaddpd zmm0, zmm0, zmm4
vaddpd zmm1, zmm1, zmm4
vmovdqa YMMWORD PTR [rsp-24], ymm7
vmovdqa ymm7, YMMWORD PTR [rdx+64]
vmovapd zmm3, ZMMWORD PTR [rsp-56]
vmovdqa YMMWORD PTR [rsp+40], ymm6
vmovdqa ymm6, YMMWORD PTR [rdx+160]
vmovdqa YMMWORD PTR [rsp+200], ymm5
vmulpd zmm2, zmm0, zmm9
vmovdqa YMMWORD PTR [rsp+136], ymm7
vmulpd zmm5, zmm1, zmm3
vbroadcastsd zmm3, xmm3
vmovdqa YMMWORD PTR [rsp+232], ymm6
vaddpd zmm3, zmm3, zmm4
vmovapd zmm7, zmm2
vmovapd zmm2, ZMMWORD PTR [rsp+8]
vfmadd231pd zmm7{k1}, zmm8, zmm1
vmovapd zmm6, zmm5
vmovapd zmm5, ZMMWORD PTR [rsp+136]
vmulpd zmm1, zmm1, zmm2
vfmadd231pd zmm6{k1}, zmm9, zmm3
vbroadcastsd zmm2, xmm2
vmovapd zmm3, ZMMWORD PTR [rsp+200]
vaddpd zmm2, zmm2, zmm4
vmovapd ZMMWORD PTR [rdi], zmm7
vfmadd231pd zmm1{k1}, zmm9, zmm2
vmulpd zmm2, zmm0, zmm5
vbroadcastsd zmm5, xmm5
vmulpd zmm0, zmm0, zmm3
vbroadcastsd zmm3, xmm3
vaddpd zmm5, zmm5, zmm4
vaddpd zmm3, zmm3, zmm4
vfmadd231pd zmm2{k1}, zmm8, zmm5
vfmadd231pd zmm0{k1}, zmm8, zmm3
vaddpd zmm2, zmm2, zmm6
vaddpd zmm0, zmm0, zmm1
vmovapd ZMMWORD PTR [rdi+64], zmm2
vmovapd ZMMWORD PTR [rdi+128], zmm0
vzeroupper
leave
ret
prod(Dual<Dual<double, 8l>, 2l>&, Dual<Dual<double, 8l>, 2l> const&,
Dual<Dual<double, 8l>, 2l> const&):
push rbp
mov rbp, rsp
and rsp, -64
sub rsp, 648
vmovdqa ymm5, YMMWORD PTR [rsi+224]
vmovdqa ymm3, YMMWORD PTR [rsi+352]
vmovapd zmm0, ZMMWORD PTR [rdx+64]
vmovdqa ymm2, YMMWORD PTR [rsi+320]
vmovdqa YMMWORD PTR [rsp+104], ymm5
vmovdqa ymm5, YMMWORD PTR [rdx+224]
vmovdqa ymm7, YMMWORD PTR [rsi+128]
vmovdqa YMMWORD PTR [rsp+232], ymm3
vmovsd xmm3, QWORD PTR [rsi]
vmovdqa ymm6, YMMWORD PTR [rsi+192]
vmovdqa YMMWORD PTR [rsp+488], ymm5
vmovdqa ymm4, YMMWORD PTR [rdx+192]
vmovapd zmm1, ZMMWORD PTR [rsi+64]
vbroadcastsd zmm5, xmm3
vmovdqa YMMWORD PTR [rsp+200], ymm2
vmovdqa ymm2, YMMWORD PTR [rdx+320]
vmulpd zmm8, zmm5, zmm0
vmovdqa YMMWORD PTR [rsp+8], ymm7
vmovdqa ymm7, YMMWORD PTR [rsi+256]
vmovdqa YMMWORD PTR [rsp+72], ymm6
vmovdqa ymm6, YMMWORD PTR [rdx+128]
vmovdqa YMMWORD PTR [rsp+584], ymm2
vmovsd xmm2, QWORD PTR [rdx]
vmovdqa YMMWORD PTR [rsp+136], ymm7
vmovdqa ymm7, YMMWORD PTR [rdx+256]
vmovdqa YMMWORD PTR [rsp+392], ymm6
vmovdqa ymm6, YMMWORD PTR [rdx+352]
vmulsd xmm10, xmm3, xmm2
vmovdqa YMMWORD PTR [rsp+456], ymm4
vbroadcastsd zmm4, xmm2
vfmadd231pd zmm8, zmm4, zmm1
vmovdqa YMMWORD PTR [rsp+520], ymm7
vmovdqa YMMWORD PTR [rsp+616], ymm6
vmulpd zmm9, zmm4, ZMMWORD PTR [rsp+72]
vmovsd xmm6, QWORD PTR [rsp+520]
vmulpd zmm4, zmm4, ZMMWORD PTR [rsp+200]
vmulpd zmm11, zmm5, ZMMWORD PTR [rsp+456]
vmovsd QWORD PTR [rdi], xmm10
vmulpd zmm5, zmm5, ZMMWORD PTR [rsp+584]
vmovapd ZMMWORD PTR [rdi+64], zmm8
vfmadd231pd zmm9, zmm0, QWORD PTR [rsp+8]{1to8}
vfmadd231pd zmm4, zmm0, QWORD PTR [rsp+136]{1to8}
vmovsd xmm0, QWORD PTR [rsp+392]
vmulsd xmm7, xmm3, xmm0
vbroadcastsd zmm0, xmm0
vmulsd xmm3, xmm3, xmm6
vfmadd132pd zmm0, zmm11, zmm1
vbroadcastsd zmm6, xmm6
vfmadd132pd zmm1, zmm5, zmm6
vfmadd231sd xmm7, xmm2, QWORD PTR [rsp+8]
vfmadd132sd xmm2, xmm3, QWORD PTR [rsp+136]
vaddpd zmm0, zmm0, zmm9
vaddpd zmm1, zmm1, zmm4
vmovapd ZMMWORD PTR [rdi+192], zmm0
vmovsd QWORD PTR [rdi+128], xmm7
vmovsd QWORD PTR [rdi+256], xmm2
vmovapd ZMMWORD PTR [rdi+320], zmm1
vzeroupper
leave
ret
GCC respects the vector builtins and uses 512 bit ops, but then does splits and
spills across function boundaries.
So, what I'm arguing is, while it would be great to respect
`-mprefer-vector-width=512`, it should ideally also be able to respect vector
builtins/intrinsics, so that one can use full width vectors without also having
to set `-mprefer-vector-width=512 -mtune-ctrl=avx512_move_by_pieces`.
^ permalink raw reply [flat|nested] 11+ messages in thread
* [Bug middle-end/112824] Stack spills and vector splitting with vector builtins
2023-12-02 11:42 [Bug tree-optimization/112824] New: Stack spills and vector splitting with vector builtins elrodc at gmail dot com
` (5 preceding siblings ...)
2023-12-04 16:43 ` elrodc at gmail dot com
@ 2023-12-05 3:44 ` liuhongt at gcc dot gnu.org
2023-12-05 5:06 ` elrodc at gmail dot com
` (2 subsequent siblings)
9 siblings, 0 replies; 11+ messages in thread
From: liuhongt at gcc dot gnu.org @ 2023-12-05 3:44 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112824
--- Comment #7 from Hongtao Liu <liuhongt at gcc dot gnu.org> ---
(In reply to Chris Elrod from comment #6)
> Hongtao Liu, I do think that one should ideally be able to get optimal
> codegen when using 512-bit builtin vectors or vector intrinsics, without
> needing to set `-mprefer-vector-width=512` (and, currently, also setting
> `-mtune-ctrl=avx512_move_by_pieces`).
>
>
> GCC respects the vector builtins and uses 512 bit ops, but then does splits
> and spills across function boundaries.
> So, what I'm arguing is, while it would be great to respect
> `-mprefer-vector-width=512`, it should ideally also be able to respect
> vector builtins/intrinsics, so that one can use full width vectors without
> also having to set `-mprefer-vector-width=512
> -mtune-ctrl=avx512_move_by_pieces`.
If it's designed the way you want it to be, another issue would be like, should
we lower 512-bit vector builtins/intrinsic to ymm/xmm when
-mprefer-vector-width=256, the answer is we'd rather not. The intrinsic should
be closer to a one-to-one correspondence with instructions. (Though there are
several intrinsics which are lowered to a sequence of instructions.)
There are also other users using 512-bit intrinsics for specific kernel loops,
but who still want the compiler to generate xmm/ymm for other code, via
-mprefer-vector-width=256.
Originally -mprefer-vector-width=XXX is designed for auto-vectorization, and
-mtune-ctrl=avx512_move_by_pieces is for memory movement. Both of which are
orthogonal to codegen for builtin, intrinsic or explicit vector types. If user
explicitly use 512-bit vector type, builtins or intrinsics, gcc will generate
zmm no matter -mprefer-vector-width=.
And yes, there could be some mismatches between 512-bit intrinsic and
architecture tuning when you're using 512-bit intrinsic, and also rely on
compiler autogen to handle struct
(struct Dual<T,N> {
Vector<T, N+1> data;
};).
For such case, an explicit -mprefer-vector-width=512 is needed.
^ permalink raw reply [flat|nested] 11+ messages in thread
* [Bug middle-end/112824] Stack spills and vector splitting with vector builtins
2023-12-02 11:42 [Bug tree-optimization/112824] New: Stack spills and vector splitting with vector builtins elrodc at gmail dot com
` (6 preceding siblings ...)
2023-12-05 3:44 ` liuhongt at gcc dot gnu.org
@ 2023-12-05 5:06 ` elrodc at gmail dot com
2023-12-07 7:21 ` wwwhhhyyy333 at gmail dot com
2023-12-15 2:36 ` cvs-commit at gcc dot gnu.org
9 siblings, 0 replies; 11+ messages in thread
From: elrodc at gmail dot com @ 2023-12-05 5:06 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112824
--- Comment #8 from Chris Elrod <elrodc at gmail dot com> ---
> If it's designed the way you want it to be, another issue would be like, should we lower 512-bit vector builtins/intrinsic to ymm/xmm when -mprefer-vector-width=256, the answer is we'd rather not.
To be clear, what I meant by
> it would be great to respect
> `-mprefer-vector-width=512`, it should ideally also be able to respect
> vector builtins/intrinsics
is that when someone uses 512 bit vector builtins, that codegen should generate
512 bit code regardless of `mprefer-vector-width` settings.
That is, as a developer, I would want 512 bit builtins to mean we get 512-bit
vector code generation.
> If user explicitly use 512-bit vector type, builtins or intrinsics, gcc will generate zmm no matter -mprefer-vector-width=.
This is what I would want, and I'd also want it to apply to movement of
`struct`s holding vector builtin objects, instead of the `ymm` usage as we see
here.
> And yes, there could be some mismatches between 512-bit intrinsic and architecture tuning when you're using 512-bit intrinsic, and also rely on compiler autogen to handle struct
> For such case, an explicit -mprefer-vector-width=512 is needed.
Note the template partial specialization
template <std::floating_point T, ptrdiff_t N> struct Vector<T,N>{
static constexpr ptrdiff_t W = N >= VecWidth<T> ? VecWidth<T> :
ptrdiff_t(std::bit_ceil(size_t(N)));
static constexpr ptrdiff_t L = (N/W) + ((N%W)!=0);
using V = Vec<W,T>;
V data[L];
static constexpr auto size()->ptrdiff_t{return N;}
};
Thus, `Vector`s in this example may explicitly be structs containing arrays of
vector builtins. I would expect these structs to not need an
`mprefer-vector-width=512` setting for producing 512 bit code handling this
struct.
Given small `L`, I would also expect passing this struct as an argument by
value to a non-inlined function to be done in `zmm` registers when possible,
for example.
^ permalink raw reply [flat|nested] 11+ messages in thread
* [Bug middle-end/112824] Stack spills and vector splitting with vector builtins
2023-12-02 11:42 [Bug tree-optimization/112824] New: Stack spills and vector splitting with vector builtins elrodc at gmail dot com
` (7 preceding siblings ...)
2023-12-05 5:06 ` elrodc at gmail dot com
@ 2023-12-07 7:21 ` wwwhhhyyy333 at gmail dot com
2023-12-15 2:36 ` cvs-commit at gcc dot gnu.org
9 siblings, 0 replies; 11+ messages in thread
From: wwwhhhyyy333 at gmail dot com @ 2023-12-07 7:21 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112824
Hongyu Wang <wwwhhhyyy333 at gmail dot com> changed:
What |Removed |Added
----------------------------------------------------------------------------
CC| |wwwhhhyyy333 at gmail dot com
--- Comment #9 from Hongyu Wang <wwwhhhyyy333 at gmail dot com> ---
(In reply to Hongtao Liu from comment #4)
> there're 2 reasons.
> 2. There's still spills for (subreg:DF (reg: V8DF) since
> ix86_modes_tieable_p return false for DF and V8DF.
>
There could be some issue in sra that the aggregates are not properly
scalarized due to size limit.
The sra considers maximum aggregate size using move_ratio * UNITS_PER_WORD, but
here the aggregate Dual<Dual<double, 8l>, 2l> actually contains several V8DF
component that can be handled in zmm under avx512f.
Add --param sra-max-scalarization-size-Ospeed=2048 will eliminate those spills
So for sra we can consider using MOVE_MAX * move_ratio as the size limit for
Ospeed which represents real backend instruction count.
^ permalink raw reply [flat|nested] 11+ messages in thread
* [Bug middle-end/112824] Stack spills and vector splitting with vector builtins
2023-12-02 11:42 [Bug tree-optimization/112824] New: Stack spills and vector splitting with vector builtins elrodc at gmail dot com
` (8 preceding siblings ...)
2023-12-07 7:21 ` wwwhhhyyy333 at gmail dot com
@ 2023-12-15 2:36 ` cvs-commit at gcc dot gnu.org
9 siblings, 0 replies; 11+ messages in thread
From: cvs-commit at gcc dot gnu.org @ 2023-12-15 2:36 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112824
--- Comment #10 from GCC Commits <cvs-commit at gcc dot gnu.org> ---
The master branch has been updated by Hongyu Wang <hongyuw@gcc.gnu.org>:
https://gcc.gnu.org/g:a52940cfee0908aed0d2a674a451f6d99844444d
commit r14-6575-ga52940cfee0908aed0d2a674a451f6d99844444d
Author: Hongyu Wang <hongyu.wang@intel.com>
Date: Mon Dec 11 09:15:02 2023 +0800
i386: Sync move_max/store_max with prefer-vector-width [PR112824]
Currently move_max follows the tuning feature first, but ideally it
should sync with prefer-vector-width when it is explicitly set to keep
vector move and operation with same vector size.
gcc/ChangeLog:
PR target/112824
* config/i386/i386-options.cc (ix86_option_override_internal):
Sync ix86_move_max/ix86_store_max with prefer_vector_width when
it is explicitly set.
gcc/testsuite/ChangeLog:
PR target/112824
* gcc.target/i386/pieces-memset-45.c: Remove
-mprefer-vector-width=256.
* g++.target/i386/pr112824-1.C: New test.
^ permalink raw reply [flat|nested] 11+ messages in thread
end of thread, other threads:[~2023-12-15 2:36 UTC | newest]
Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-12-02 11:42 [Bug tree-optimization/112824] New: Stack spills and vector splitting with vector builtins elrodc at gmail dot com
2023-12-02 13:20 ` [Bug tree-optimization/112824] " elrodc at gmail dot com
2023-12-03 13:29 ` [Bug middle-end/112824] " elrodc at gmail dot com
2023-12-03 13:58 ` elrodc at gmail dot com
2023-12-04 3:42 ` liuhongt at gcc dot gnu.org
2023-12-04 10:04 ` rguenth at gcc dot gnu.org
2023-12-04 16:43 ` elrodc at gmail dot com
2023-12-05 3:44 ` liuhongt at gcc dot gnu.org
2023-12-05 5:06 ` elrodc at gmail dot com
2023-12-07 7:21 ` wwwhhhyyy333 at gmail dot com
2023-12-15 2:36 ` cvs-commit at gcc dot gnu.org
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).