public inbox for gcc-bugs@sourceware.org
* [Bug tree-optimization/112824] New: Stack spills and vector splitting with vector builtins
@ 2023-12-02 11:42 elrodc at gmail dot com
  2023-12-02 13:20 ` [Bug tree-optimization/112824] " elrodc at gmail dot com
                   ` (9 more replies)
  0 siblings, 10 replies; 11+ messages in thread
From: elrodc at gmail dot com @ 2023-12-02 11:42 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112824

            Bug ID: 112824
           Summary: Stack spills and vector splitting with vector builtins
           Product: gcc
           Version: 14.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: elrodc at gmail dot com
  Target Milestone: ---

I am not sure which component to place this under, but I selected
tree-optimization as I suspect this is some sort of alias-analysis failure
that prevents the removal of stack allocations.

Godbolt link, reproduces on GCC trunk and 13.2:
https://godbolt.org/z/4TPx17Mbn
Clang has similar problems in my actual test case, but they don't show up in
this minimal example; Clang isn't perfect here either, though: it fails to
fuse the fmadd with the masked vmovapd, while GCC does fuse them.

For reference, the code behind the godbolt link is:

#include <bit>
#include <concepts>
#include <cstddef>
#include <cstdint>

template <ptrdiff_t W, typename T>
using Vec [[gnu::vector_size(W * sizeof(T))]] = T;


// Omitted: 16 without AVX, 32 without AVX512F;
// for forward compatibility, some AVX10 variants may also mean 32-only
static constexpr ptrdiff_t VectorBytes = 64;
template<typename T>
static constexpr ptrdiff_t VecWidth = 64 <= sizeof(T) ? 1 : 64/sizeof(T);

template <typename T, ptrdiff_t N> struct Vector{
    static constexpr ptrdiff_t L = N;
    T data[L];
    static constexpr auto size()->ptrdiff_t{return N;}
};
template <std::floating_point T, ptrdiff_t N> struct Vector<T,N>{
    static constexpr ptrdiff_t W = N >= VecWidth<T> ? VecWidth<T> :
ptrdiff_t(std::bit_ceil(size_t(N))); 
    static constexpr ptrdiff_t L = (N/W) + ((N%W)!=0);
    using V = Vec<W,T>;
    V data[L];
    static constexpr auto size()->ptrdiff_t{return N;}
};
/// should be trivially copyable
/// codegen is worse when passing by value, even though it seems like it
/// should make aliasing simpler to analyze?
template<typename T, ptrdiff_t N>
[[gnu::always_inline]] constexpr auto operator+(Vector<T,N> x, Vector<T,N> y)
-> Vector<T,N> {
    Vector<T,N> z;
    for (ptrdiff_t n = 0; n < Vector<T,N>::L; ++n) z.data[n] = x.data[n] +
y.data[n];
    return z;
}
template<typename T, ptrdiff_t N>
[[gnu::always_inline]] constexpr auto operator*(Vector<T,N> x, Vector<T,N> y)
-> Vector<T,N> {
    Vector<T,N> z;
    for (ptrdiff_t n = 0; n < Vector<T,N>::L; ++n) z.data[n] = x.data[n] *
y.data[n];
    return z;
}
template<typename T, ptrdiff_t N>
[[gnu::always_inline]] constexpr auto operator+(T x, Vector<T,N> y) ->
Vector<T,N> {
    Vector<T,N> z;
    for (ptrdiff_t n = 0; n < Vector<T,N>::L; ++n) z.data[n] = x + y.data[n];
    return z;
}
template<typename T, ptrdiff_t N>
[[gnu::always_inline]] constexpr auto operator*(T x, Vector<T,N> y) ->
Vector<T,N> {
    Vector<T,N> z;
    for (ptrdiff_t n = 0; n < Vector<T,N>::L; ++n) z.data[n] = x * y.data[n];
    return z;
}



template <typename T, ptrdiff_t N> struct Dual {
  T value;
  Vector<T, N> partials;
};
// Here we have a specialization for non-power-of-2 `N`
template <typename T, ptrdiff_t N> 
requires(std::floating_point<T> && (std::popcount(size_t(N))>1))
struct Dual<T,N> {
  Vector<T, N+1> data;
};


template<ptrdiff_t W, typename T>
consteval auto firstoff(){
    static_assert(std::same_as<T,double>, "type not implemented");
    if constexpr (W==2) return Vec<2,int64_t>{0,1} != 0;
    else if constexpr (W == 4) return Vec<4,int64_t>{0,1,2,3} != 0;
    else if constexpr (W == 8) return Vec<8,int64_t>{0,1,2,3,4,5,6,7} != 0;
    else static_assert(false, "vector width not implemented");
}

template <typename T, ptrdiff_t N>
[[gnu::always_inline]] constexpr auto operator+(Dual<T, N> a,
                                                Dual<T, N> b)
  -> Dual<T, N> {
  if constexpr (std::floating_point<T> && (std::popcount(size_t(N))>1)){
    Dual<T,N> c;
    for (ptrdiff_t l = 0; l < Vector<T,N>::L; ++l)
      c.data.data[l] = a.data.data[l] + b.data.data[l]; 
    return c;
  } else return {a.value + b.value, a.partials + b.partials};
}

template <typename T, ptrdiff_t N>
[[gnu::always_inline]] constexpr auto operator*(Dual<T, N> a,
                                                Dual<T, N> b)
  -> Dual<T, N> {
  if constexpr (std::floating_point<T> && (std::popcount(size_t(N))>1)){
    using V = typename Vector<T,N>::V;
    V va = V{}+a.data.data[0][0], vb = V{}+b.data.data[0][0];
    V x = va * b.data.data[0];
    Dual<T,N> c;
    c.data.data[0] = firstoff<Vector<T,N>::W,T>() ? x + vb*a.data.data[0] : x;
    for (ptrdiff_t l = 1; l < Vector<T,N>::L; ++l)
      c.data.data[l] = va*b.data.data[l] + vb*a.data.data[l]; 
    return c;
  } else return {a.value * b.value, a.value * b.partials + b.value *
a.partials};
}

void prod(Dual<Dual<double,7>,2> &c, const Dual<Dual<double,7>,2> &a, const
Dual<Dual<double,7>,2>&b){
    c = a*b;
}
void prod(Dual<Dual<double,8>,2> &c, const Dual<Dual<double,8>,2> &a, const
Dual<Dual<double,8>,2>&b){
    c = a*b;
}


GCC 13.2 asm, when compiling with
-std=gnu++23 -march=skylake-avx512 -mprefer-vector-width=512 -O3


prod(Dual<Dual<double, 7l>, 2l>&, Dual<Dual<double, 7l>, 2l> const&,
Dual<Dual<double, 7l>, 2l> const&):
        push    rbp
        mov     eax, -2
        kmovb   k1, eax
        mov     rbp, rsp
        and     rsp, -64
        sub     rsp, 264
        vmovdqa ymm4, YMMWORD PTR [rsi+128]
        vmovapd zmm8, ZMMWORD PTR [rsi]
        vmovapd zmm9, ZMMWORD PTR [rdx]
        vmovdqa ymm6, YMMWORD PTR [rsi+64]
        vmovdqa YMMWORD PTR [rsp+8], ymm4
        vmovdqa ymm4, YMMWORD PTR [rdx+96]
        vbroadcastsd    zmm0, xmm8
        vmovdqa ymm7, YMMWORD PTR [rsi+96]
        vbroadcastsd    zmm1, xmm9
        vmovdqa YMMWORD PTR [rsp-56], ymm6
        vmovdqa ymm5, YMMWORD PTR [rdx+128]
        vmovdqa ymm6, YMMWORD PTR [rsi+160]
        vmovdqa YMMWORD PTR [rsp+168], ymm4
        vxorpd  xmm4, xmm4, xmm4
        vaddpd  zmm0, zmm0, zmm4
        vaddpd  zmm1, zmm1, zmm4
        vmovdqa YMMWORD PTR [rsp-24], ymm7
        vmovdqa ymm7, YMMWORD PTR [rdx+64]
        vmovapd zmm3, ZMMWORD PTR [rsp-56]
        vmovdqa YMMWORD PTR [rsp+40], ymm6
        vmovdqa ymm6, YMMWORD PTR [rdx+160]
        vmovdqa YMMWORD PTR [rsp+200], ymm5
        vmulpd  zmm2, zmm0, zmm9
        vmovdqa YMMWORD PTR [rsp+136], ymm7
        vmulpd  zmm5, zmm1, zmm3
        vbroadcastsd    zmm3, xmm3
        vmovdqa YMMWORD PTR [rsp+232], ymm6
        vaddpd  zmm3, zmm3, zmm4
        vmovapd zmm7, zmm2
        vmovapd zmm2, ZMMWORD PTR [rsp+8]
        vfmadd231pd     zmm7{k1}, zmm8, zmm1
        vmovapd zmm6, zmm5
        vmovapd zmm5, ZMMWORD PTR [rsp+136]
        vmulpd  zmm1, zmm1, zmm2
        vfmadd231pd     zmm6{k1}, zmm9, zmm3
        vbroadcastsd    zmm2, xmm2
        vmovapd zmm3, ZMMWORD PTR [rsp+200]
        vaddpd  zmm2, zmm2, zmm4
        vmovapd ZMMWORD PTR [rdi], zmm7
        vfmadd231pd     zmm1{k1}, zmm9, zmm2
        vmulpd  zmm2, zmm0, zmm5
        vbroadcastsd    zmm5, xmm5
        vmulpd  zmm0, zmm0, zmm3
        vbroadcastsd    zmm3, xmm3
        vaddpd  zmm5, zmm5, zmm4
        vaddpd  zmm3, zmm3, zmm4
        vfmadd231pd     zmm2{k1}, zmm8, zmm5
        vfmadd231pd     zmm0{k1}, zmm8, zmm3
        vaddpd  zmm2, zmm2, zmm6
        vaddpd  zmm0, zmm0, zmm1
        vmovapd ZMMWORD PTR [rdi+64], zmm2
        vmovapd ZMMWORD PTR [rdi+128], zmm0
        vzeroupper
        leave
        ret
prod(Dual<Dual<double, 8l>, 2l>&, Dual<Dual<double, 8l>, 2l> const&,
Dual<Dual<double, 8l>, 2l> const&):
        push    rbp
        mov     rbp, rsp
        and     rsp, -64
        sub     rsp, 648
        vmovdqa ymm5, YMMWORD PTR [rsi+224]
        vmovdqa ymm3, YMMWORD PTR [rsi+352]
        vmovapd zmm0, ZMMWORD PTR [rdx+64]
        vmovdqa ymm2, YMMWORD PTR [rsi+320]
        vmovdqa YMMWORD PTR [rsp+104], ymm5
        vmovdqa ymm5, YMMWORD PTR [rdx+224]
        vmovdqa ymm7, YMMWORD PTR [rsi+128]
        vmovdqa YMMWORD PTR [rsp+232], ymm3
        vmovsd  xmm3, QWORD PTR [rsi]
        vmovdqa ymm6, YMMWORD PTR [rsi+192]
        vmovdqa YMMWORD PTR [rsp+488], ymm5
        vmovdqa ymm4, YMMWORD PTR [rdx+192]
        vmovapd zmm1, ZMMWORD PTR [rsi+64]
        vbroadcastsd    zmm5, xmm3
        vmovdqa YMMWORD PTR [rsp+200], ymm2
        vmovdqa ymm2, YMMWORD PTR [rdx+320]
        vmulpd  zmm8, zmm5, zmm0
        vmovdqa YMMWORD PTR [rsp+8], ymm7
        vmovdqa ymm7, YMMWORD PTR [rsi+256]
        vmovdqa YMMWORD PTR [rsp+72], ymm6
        vmovdqa ymm6, YMMWORD PTR [rdx+128]
        vmovdqa YMMWORD PTR [rsp+584], ymm2
        vmovsd  xmm2, QWORD PTR [rdx]
        vmovdqa YMMWORD PTR [rsp+136], ymm7
        vmovdqa ymm7, YMMWORD PTR [rdx+256]
        vmovdqa YMMWORD PTR [rsp+392], ymm6
        vmovdqa ymm6, YMMWORD PTR [rdx+352]
        vmulsd  xmm10, xmm3, xmm2
        vmovdqa YMMWORD PTR [rsp+456], ymm4
        vbroadcastsd    zmm4, xmm2
        vfmadd231pd     zmm8, zmm4, zmm1
        vmovdqa YMMWORD PTR [rsp+520], ymm7
        vmovdqa YMMWORD PTR [rsp+616], ymm6
        vmulpd  zmm9, zmm4, ZMMWORD PTR [rsp+72]
        vmovsd  xmm6, QWORD PTR [rsp+520]
        vmulpd  zmm4, zmm4, ZMMWORD PTR [rsp+200]
        vmulpd  zmm11, zmm5, ZMMWORD PTR [rsp+456]
        vmovsd  QWORD PTR [rdi], xmm10
        vmulpd  zmm5, zmm5, ZMMWORD PTR [rsp+584]
        vmovapd ZMMWORD PTR [rdi+64], zmm8
        vfmadd231pd     zmm9, zmm0, QWORD PTR [rsp+8]{1to8}
        vfmadd231pd     zmm4, zmm0, QWORD PTR [rsp+136]{1to8}
        vmovsd  xmm0, QWORD PTR [rsp+392]
        vmulsd  xmm7, xmm3, xmm0
        vbroadcastsd    zmm0, xmm0
        vmulsd  xmm3, xmm3, xmm6
        vfmadd132pd     zmm0, zmm11, zmm1
        vbroadcastsd    zmm6, xmm6
        vfmadd132pd     zmm1, zmm5, zmm6
        vfmadd231sd     xmm7, xmm2, QWORD PTR [rsp+8]
        vfmadd132sd     xmm2, xmm3, QWORD PTR [rsp+136]
        vaddpd  zmm0, zmm0, zmm9
        vaddpd  zmm1, zmm1, zmm4
        vmovapd ZMMWORD PTR [rdi+192], zmm0
        vmovsd  QWORD PTR [rdi+128], xmm7
        vmovsd  QWORD PTR [rdi+256], xmm2
        vmovapd ZMMWORD PTR [rdi+320], zmm1
        vzeroupper
        leave
        ret


Note all the stores to/loads from rsp, and the use of ymm registers.


* [Bug tree-optimization/112824] Stack spills and vector splitting with vector builtins
  2023-12-02 11:42 [Bug tree-optimization/112824] New: Stack spills and vector splitting with vector builtins elrodc at gmail dot com
@ 2023-12-02 13:20 ` elrodc at gmail dot com
  2023-12-03 13:29 ` [Bug middle-end/112824] " elrodc at gmail dot com
                   ` (8 subsequent siblings)
  9 siblings, 0 replies; 11+ messages in thread
From: elrodc at gmail dot com @ 2023-12-02 13:20 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112824

--- Comment #1 from Chris Elrod <elrodc at gmail dot com> ---
Here I have added a godbolt example where I manually unroll the array, and
GCC generates excellent code: https://godbolt.org/z/sd4bhGW7e
I'm not sure it is 100% optimal, but with an inner Dual size of `7`, on
Skylake-X it is 38 uops for the manually unrolled GCC version with separate
struct fields, vs 49 uops for Clang, vs 67 for GCC with arrays.
uiCA expects <14 clock cycles for the manually unrolled version vs >23 for the
array version.

My experience so far with expression templates has borne this out: compilers
seem to struggle with peeling away abstractions.


* [Bug middle-end/112824] Stack spills and vector splitting with vector builtins
  2023-12-02 11:42 [Bug tree-optimization/112824] New: Stack spills and vector splitting with vector builtins elrodc at gmail dot com
  2023-12-02 13:20 ` [Bug tree-optimization/112824] " elrodc at gmail dot com
@ 2023-12-03 13:29 ` elrodc at gmail dot com
  2023-12-03 13:58 ` elrodc at gmail dot com
                   ` (7 subsequent siblings)
  9 siblings, 0 replies; 11+ messages in thread
From: elrodc at gmail dot com @ 2023-12-03 13:29 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112824

--- Comment #2 from Chris Elrod <elrodc at gmail dot com> ---
https://godbolt.org/z/3648aMTz8

Perhaps a simpler reproducer: commenting out the pragma below brings the
problem back, while codegen becomes good with it.

template<typename T, ptrdiff_t N>
constexpr auto operator*(OuterDualUA2<T,N> a, OuterDualUA2<T,N>
b)->OuterDualUA2<T,N>{  
  //return {a.value*b.value,a.value*b.p[0]+b.value*a.p[0],a.value*b.p[1]+b.value*a.p[1]};
  OuterDualUA2<T,N> c;
  c.value = a.value*b.value;
#pragma GCC unroll 16
  for (ptrdiff_t i = 0; i < 2; ++i)
    c.p[i] = a.value*b.p[i] + b.value*a.p[i];
  //c.p[0] = a.value*b.p[0] + b.value*a.p[0];
  //c.p[1] = a.value*b.p[1] + b.value*a.p[1];
  return c;
}


It's not great to have to add pragmas everywhere in my actual codebase. I
thought I hit the important cases, but my non-minimal example still gets
unnecessary register splits and stack spills, so maybe I missed places, or
perhaps there's another issue.

Given that GCC unrolls the above code even without the pragma, it seems like a
definite bug that the pragma is needed for the resulting code generation to
actually be good.
Not knowing the compiler pipeline, my naive guess is that the pragma causes
earlier unrolling than whatever optimization pass does it sans pragma, and that
some important analysis/optimization gets run between those two times.


* [Bug middle-end/112824] Stack spills and vector splitting with vector builtins
  2023-12-02 11:42 [Bug tree-optimization/112824] New: Stack spills and vector splitting with vector builtins elrodc at gmail dot com
  2023-12-02 13:20 ` [Bug tree-optimization/112824] " elrodc at gmail dot com
  2023-12-03 13:29 ` [Bug middle-end/112824] " elrodc at gmail dot com
@ 2023-12-03 13:58 ` elrodc at gmail dot com
  2023-12-04  3:42 ` liuhongt at gcc dot gnu.org
                   ` (6 subsequent siblings)
  9 siblings, 0 replies; 11+ messages in thread
From: elrodc at gmail dot com @ 2023-12-03 13:58 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112824

--- Comment #3 from Chris Elrod <elrodc at gmail dot com> ---
> I thought I hit the important cases, but my non-minimal example still gets unnecessary register splits and stack spills, so maybe I missed places, or perhaps there's another issue.

Adding the unroll pragma to the `Vector`'s operator + and *:

template<typename T, ptrdiff_t N>
[[gnu::always_inline]] constexpr auto operator+(Vector<T,N> x, Vector<T,N> y)
-> Vector<T,N> {
    Vector<T,N> z;
#pragma GCC unroll 16
    for (ptrdiff_t n = 0; n < Vector<T,N>::L; ++n) z.data[n] = x.data[n] +
y.data[n];
    return z;
}
template<typename T, ptrdiff_t N>
[[gnu::always_inline]] constexpr auto operator*(Vector<T,N> x, Vector<T,N> y)
-> Vector<T,N> {
    Vector<T,N> z;
#pragma GCC unroll 16
    for (ptrdiff_t n = 0; n < Vector<T,N>::L; ++n) z.data[n] = x.data[n] *
y.data[n];
    return z;
}
template<typename T, ptrdiff_t N>
[[gnu::always_inline]] constexpr auto operator+(T x, Vector<T,N> y) ->
Vector<T,N> {
    Vector<T,N> z;
#pragma GCC unroll 16
    for (ptrdiff_t n = 0; n < Vector<T,N>::L; ++n) z.data[n] = x + y.data[n];
    return z;
}
template<typename T, ptrdiff_t N>
[[gnu::always_inline]] constexpr auto operator*(T x, Vector<T,N> y) ->
Vector<T,N> {
    Vector<T,N> z;
#pragma GCC unroll 16
    for (ptrdiff_t n = 0; n < Vector<T,N>::L; ++n) z.data[n] = x * y.data[n];
    return z;
}


does not improve code generation (I still get the same problem), so that is a
reproducer for such an issue.


* [Bug middle-end/112824] Stack spills and vector splitting with vector builtins
  2023-12-02 11:42 [Bug tree-optimization/112824] New: Stack spills and vector splitting with vector builtins elrodc at gmail dot com
                   ` (2 preceding siblings ...)
  2023-12-03 13:58 ` elrodc at gmail dot com
@ 2023-12-04  3:42 ` liuhongt at gcc dot gnu.org
  2023-12-04 10:04 ` rguenth at gcc dot gnu.org
                   ` (5 subsequent siblings)
  9 siblings, 0 replies; 11+ messages in thread
From: liuhongt at gcc dot gnu.org @ 2023-12-04  3:42 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112824

liuhongt at gcc dot gnu.org changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |liuhongt at gcc dot gnu.org

--- Comment #4 from liuhongt at gcc dot gnu.org ---
There are two reasons.
1. X86_TUNE_AVX512_MOVE_BY_PIECES is used for moving structs, but it is
independent of -mprefer-vector-width=512; manually adding
-mtune-ctrl=avx512_move_by_pieces solves most cases.
2. There are still spills for (subreg:DF (reg:V8DF)) since
ix86_modes_tieable_p returns false for DF and V8DF.

For 1, I guess that when there is an explicit -mprefer-vector-width=512, it
should override the X86_TUNE_AVX512_MOVE_BY_PIECES tuning and unset
X86_TUNE_AVX256_OPTIMAL/X86_TUNE_AVX128_OPTIMAL.

For 2, according to the documentation it should return false, since DF can be
allocated to an x87 register but V8DF only to SSE_REGS, so the fix should be
to use vec_select instead of subreg for that case?
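
For reference, a compiler invocation with the workaround for reason 1 applied
on top of the flags used in the report (the source file name is a placeholder;
this does not address the DF/V8DF subreg spills from reason 2):

g++ -std=gnu++23 -march=skylake-avx512 -mprefer-vector-width=512 -O3 \
    -mtune-ctrl=avx512_move_by_pieces dual.cc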


* [Bug middle-end/112824] Stack spills and vector splitting with vector builtins
  2023-12-02 11:42 [Bug tree-optimization/112824] New: Stack spills and vector splitting with vector builtins elrodc at gmail dot com
                   ` (3 preceding siblings ...)
  2023-12-04  3:42 ` liuhongt at gcc dot gnu.org
@ 2023-12-04 10:04 ` rguenth at gcc dot gnu.org
  2023-12-04 16:43 ` elrodc at gmail dot com
                   ` (4 subsequent siblings)
  9 siblings, 0 replies; 11+ messages in thread
From: rguenth at gcc dot gnu.org @ 2023-12-04 10:04 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112824

Richard Biener <rguenth at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
     Ever confirmed|0                           |1
   Last reconfirmed|                            |2023-12-04
             Status|UNCONFIRMED                 |NEW

--- Comment #5 from Richard Biener <rguenth at gcc dot gnu.org> ---
On GIMPLE there's nothing wrong, I think.  There's some inconsistency in
representing a splat: when we have a scalar source we use a CONSTRUCTOR, but
when we have a (multi-use) vector we use a VEC_PERM to splat its first
element (though I guess that's actually good in the end).
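
For reference, a minimal sketch of the two source-level splat forms being
described (an illustration only; the alias and helper names below are not from
the testcase):

using V [[gnu::vector_size(64)]] = double;  // same layout as Vec<8, double> above

// Splat of a plain scalar (scalar source -> CONSTRUCTOR, per the comment
// above), as in operator*(T, Vector<T,N>) in the testcase.
V splat_scalar(double s) { return V{} + s; }

// Splat of lane 0 of an existing vector (represented with a VEC_PERM when the
// source vector has other uses), as in `V{} + a.data.data[0][0]` inside
// operator*(Dual, Dual) in the testcase.
V splat_lane0(V v) { return V{} + v[0]; }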


* [Bug middle-end/112824] Stack spills and vector splitting with vector builtins
  2023-12-02 11:42 [Bug tree-optimization/112824] New: Stack spills and vector splitting with vector builtins elrodc at gmail dot com
                   ` (4 preceding siblings ...)
  2023-12-04 10:04 ` rguenth at gcc dot gnu.org
@ 2023-12-04 16:43 ` elrodc at gmail dot com
  2023-12-05  3:44 ` liuhongt at gcc dot gnu.org
                   ` (3 subsequent siblings)
  9 siblings, 0 replies; 11+ messages in thread
From: elrodc at gmail dot com @ 2023-12-04 16:43 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112824

--- Comment #6 from Chris Elrod <elrodc at gmail dot com> ---
Hongtao Liu, I do think that one should ideally be able to get optimal codegen
when using 512-bit builtin vectors or vector intrinsics, without needing to set
`-mprefer-vector-width=512` (and, currently, also setting
`-mtune-ctrl=avx512_move_by_pieces`).

For example, if I remove `-mprefer-vector-width=512`, I get

prod(Dual<Dual<double, 7l>, 2l>&, Dual<Dual<double, 7l>, 2l> const&,
Dual<Dual<double, 7l>, 2l> const&):
        push    rbp
        mov     eax, -2
        kmovb   k1, eax
        mov     rbp, rsp
        and     rsp, -64
        sub     rsp, 264
        vmovdqa ymm4, YMMWORD PTR [rsi+128]
        vmovapd zmm8, ZMMWORD PTR [rsi]
        vmovapd zmm9, ZMMWORD PTR [rdx]
        vmovdqa ymm6, YMMWORD PTR [rsi+64]
        vmovdqa YMMWORD PTR [rsp+8], ymm4
        vmovdqa ymm4, YMMWORD PTR [rdx+96]
        vbroadcastsd    zmm0, xmm8
        vmovdqa ymm7, YMMWORD PTR [rsi+96]
        vbroadcastsd    zmm1, xmm9
        vmovdqa YMMWORD PTR [rsp-56], ymm6
        vmovdqa ymm5, YMMWORD PTR [rdx+128]
        vmovdqa ymm6, YMMWORD PTR [rsi+160]
        vmovdqa YMMWORD PTR [rsp+168], ymm4
        vxorpd  xmm4, xmm4, xmm4
        vaddpd  zmm0, zmm0, zmm4
        vaddpd  zmm1, zmm1, zmm4
        vmovdqa YMMWORD PTR [rsp-24], ymm7
        vmovdqa ymm7, YMMWORD PTR [rdx+64]
        vmovapd zmm3, ZMMWORD PTR [rsp-56]
        vmovdqa YMMWORD PTR [rsp+40], ymm6
        vmovdqa ymm6, YMMWORD PTR [rdx+160]
        vmovdqa YMMWORD PTR [rsp+200], ymm5
        vmulpd  zmm2, zmm0, zmm9
        vmovdqa YMMWORD PTR [rsp+136], ymm7
        vmulpd  zmm5, zmm1, zmm3
        vbroadcastsd    zmm3, xmm3
        vmovdqa YMMWORD PTR [rsp+232], ymm6
        vaddpd  zmm3, zmm3, zmm4
        vmovapd zmm7, zmm2
        vmovapd zmm2, ZMMWORD PTR [rsp+8]
        vfmadd231pd     zmm7{k1}, zmm8, zmm1
        vmovapd zmm6, zmm5
        vmovapd zmm5, ZMMWORD PTR [rsp+136]
        vmulpd  zmm1, zmm1, zmm2
        vfmadd231pd     zmm6{k1}, zmm9, zmm3
        vbroadcastsd    zmm2, xmm2
        vmovapd zmm3, ZMMWORD PTR [rsp+200]
        vaddpd  zmm2, zmm2, zmm4
        vmovapd ZMMWORD PTR [rdi], zmm7
        vfmadd231pd     zmm1{k1}, zmm9, zmm2
        vmulpd  zmm2, zmm0, zmm5
        vbroadcastsd    zmm5, xmm5
        vmulpd  zmm0, zmm0, zmm3
        vbroadcastsd    zmm3, xmm3
        vaddpd  zmm5, zmm5, zmm4
        vaddpd  zmm3, zmm3, zmm4
        vfmadd231pd     zmm2{k1}, zmm8, zmm5
        vfmadd231pd     zmm0{k1}, zmm8, zmm3
        vaddpd  zmm2, zmm2, zmm6
        vaddpd  zmm0, zmm0, zmm1
        vmovapd ZMMWORD PTR [rdi+64], zmm2
        vmovapd ZMMWORD PTR [rdi+128], zmm0
        vzeroupper
        leave
        ret
prod(Dual<Dual<double, 8l>, 2l>&, Dual<Dual<double, 8l>, 2l> const&,
Dual<Dual<double, 8l>, 2l> const&):
        push    rbp
        mov     rbp, rsp
        and     rsp, -64
        sub     rsp, 648
        vmovdqa ymm5, YMMWORD PTR [rsi+224]
        vmovdqa ymm3, YMMWORD PTR [rsi+352]
        vmovapd zmm0, ZMMWORD PTR [rdx+64]
        vmovdqa ymm2, YMMWORD PTR [rsi+320]
        vmovdqa YMMWORD PTR [rsp+104], ymm5
        vmovdqa ymm5, YMMWORD PTR [rdx+224]
        vmovdqa ymm7, YMMWORD PTR [rsi+128]
        vmovdqa YMMWORD PTR [rsp+232], ymm3
        vmovsd  xmm3, QWORD PTR [rsi]
        vmovdqa ymm6, YMMWORD PTR [rsi+192]
        vmovdqa YMMWORD PTR [rsp+488], ymm5
        vmovdqa ymm4, YMMWORD PTR [rdx+192]
        vmovapd zmm1, ZMMWORD PTR [rsi+64]
        vbroadcastsd    zmm5, xmm3
        vmovdqa YMMWORD PTR [rsp+200], ymm2
        vmovdqa ymm2, YMMWORD PTR [rdx+320]
        vmulpd  zmm8, zmm5, zmm0
        vmovdqa YMMWORD PTR [rsp+8], ymm7
        vmovdqa ymm7, YMMWORD PTR [rsi+256]
        vmovdqa YMMWORD PTR [rsp+72], ymm6
        vmovdqa ymm6, YMMWORD PTR [rdx+128]
        vmovdqa YMMWORD PTR [rsp+584], ymm2
        vmovsd  xmm2, QWORD PTR [rdx]
        vmovdqa YMMWORD PTR [rsp+136], ymm7
        vmovdqa ymm7, YMMWORD PTR [rdx+256]
        vmovdqa YMMWORD PTR [rsp+392], ymm6
        vmovdqa ymm6, YMMWORD PTR [rdx+352]
        vmulsd  xmm10, xmm3, xmm2
        vmovdqa YMMWORD PTR [rsp+456], ymm4
        vbroadcastsd    zmm4, xmm2
        vfmadd231pd     zmm8, zmm4, zmm1
        vmovdqa YMMWORD PTR [rsp+520], ymm7
        vmovdqa YMMWORD PTR [rsp+616], ymm6
        vmulpd  zmm9, zmm4, ZMMWORD PTR [rsp+72]
        vmovsd  xmm6, QWORD PTR [rsp+520]
        vmulpd  zmm4, zmm4, ZMMWORD PTR [rsp+200]
        vmulpd  zmm11, zmm5, ZMMWORD PTR [rsp+456]
        vmovsd  QWORD PTR [rdi], xmm10
        vmulpd  zmm5, zmm5, ZMMWORD PTR [rsp+584]
        vmovapd ZMMWORD PTR [rdi+64], zmm8
        vfmadd231pd     zmm9, zmm0, QWORD PTR [rsp+8]{1to8}
        vfmadd231pd     zmm4, zmm0, QWORD PTR [rsp+136]{1to8}
        vmovsd  xmm0, QWORD PTR [rsp+392]
        vmulsd  xmm7, xmm3, xmm0
        vbroadcastsd    zmm0, xmm0
        vmulsd  xmm3, xmm3, xmm6
        vfmadd132pd     zmm0, zmm11, zmm1
        vbroadcastsd    zmm6, xmm6
        vfmadd132pd     zmm1, zmm5, zmm6
        vfmadd231sd     xmm7, xmm2, QWORD PTR [rsp+8]
        vfmadd132sd     xmm2, xmm3, QWORD PTR [rsp+136]
        vaddpd  zmm0, zmm0, zmm9
        vaddpd  zmm1, zmm1, zmm4
        vmovapd ZMMWORD PTR [rdi+192], zmm0
        vmovsd  QWORD PTR [rdi+128], xmm7
        vmovsd  QWORD PTR [rdi+256], xmm2
        vmovapd ZMMWORD PTR [rdi+320], zmm1
        vzeroupper
        leave
        ret


GCC respects the vector builtins and uses 512-bit ops, but then splits vectors
and spills them across function boundaries.
So, what I'm arguing is: while it would be great to respect
`-mprefer-vector-width=512`, it should ideally also be able to respect vector
builtins/intrinsics, so that one can use full-width vectors without also having
to set `-mprefer-vector-width=512 -mtune-ctrl=avx512_move_by_pieces`.


* [Bug middle-end/112824] Stack spills and vector splitting with vector builtins
  2023-12-02 11:42 [Bug tree-optimization/112824] New: Stack spills and vector splitting with vector builtins elrodc at gmail dot com
                   ` (5 preceding siblings ...)
  2023-12-04 16:43 ` elrodc at gmail dot com
@ 2023-12-05  3:44 ` liuhongt at gcc dot gnu.org
  2023-12-05  5:06 ` elrodc at gmail dot com
                   ` (2 subsequent siblings)
  9 siblings, 0 replies; 11+ messages in thread
From: liuhongt at gcc dot gnu.org @ 2023-12-05  3:44 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112824

--- Comment #7 from Hongtao Liu <liuhongt at gcc dot gnu.org> ---
(In reply to Chris Elrod from comment #6)
> Hongtao Liu, I do think that one should ideally be able to get optimal
> codegen when using 512-bit builtin vectors or vector intrinsics, without
> needing to set `-mprefer-vector-width=512` (and, currently, also setting
> `-mtune-ctrl=avx512_move_by_pieces`).
> 
> 
> GCC respects the vector builtins and uses 512-bit ops, but then splits
> vectors and spills them across function boundaries.
> So, what I'm arguing is: while it would be great to respect
> `-mprefer-vector-width=512`, it should ideally also be able to respect vector
> builtins/intrinsics, so that one can use full-width vectors without also
> having to set `-mprefer-vector-width=512 -mtune-ctrl=avx512_move_by_pieces`.
If it were designed the way you want, another issue would arise: should we
lower 512-bit vector builtins/intrinsics to ymm/xmm when
-mprefer-vector-width=256? The answer is we'd rather not. An intrinsic should
be closer to a one-to-one correspondence with instructions (though there are
several intrinsics which are lowered to a sequence of instructions).
There are also users who use 512-bit intrinsics for specific kernel loops but
still want the compiler to generate xmm/ymm for other code, via
-mprefer-vector-width=256.

Originally, -mprefer-vector-width=XXX was designed for auto-vectorization, and
-mtune-ctrl=avx512_move_by_pieces is for memory movement. Both are orthogonal
to codegen for builtins, intrinsics or explicit vector types. If the user
explicitly uses a 512-bit vector type, builtins or intrinsics, GCC will
generate zmm no matter what -mprefer-vector-width= is.

And yes, there can be a mismatch between 512-bit intrinsics and the
architecture tuning when you use 512-bit intrinsics but also rely on the
compiler's own code generation to handle a struct such as

struct Dual<T,N> {
  Vector<T, N+1> data;
};

For such a case, an explicit -mprefer-vector-width=512 is needed.
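
To make the distinction concrete, a small sketch (the struct, type and function
names below are hypothetical, not from this report): the explicit 512-bit
intrinsics are lowered to zmm instructions regardless of -mprefer-vector-width=,
while the plain struct copy goes through the move-by-pieces machinery, whose
width follows the tuning described in comment #4.

#include <immintrin.h>

struct Payload { __m512d v[2]; };   // struct with explicit 512-bit members

__m512d scale(__m512d x, double s) {
    // Explicit intrinsics: lowered to zmm instructions no matter what
    // -mprefer-vector-width= says.
    return _mm512_mul_pd(x, _mm512_set1_pd(s));
}

void copy_payload(Payload &dst, const Payload &src) {
    // Plain aggregate copy: lowered via move-by-pieces, whose width follows
    // the tuning (X86_TUNE_AVX512_MOVE_BY_PIECES), so it may use ymm moves
    // unless that tuning is enabled, e.g. via
    // -mtune-ctrl=avx512_move_by_pieces.
    dst = src;
}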


* [Bug middle-end/112824] Stack spills and vector splitting with vector builtins
  2023-12-02 11:42 [Bug tree-optimization/112824] New: Stack spills and vector splitting with vector builtins elrodc at gmail dot com
                   ` (6 preceding siblings ...)
  2023-12-05  3:44 ` liuhongt at gcc dot gnu.org
@ 2023-12-05  5:06 ` elrodc at gmail dot com
  2023-12-07  7:21 ` wwwhhhyyy333 at gmail dot com
  2023-12-15  2:36 ` cvs-commit at gcc dot gnu.org
  9 siblings, 0 replies; 11+ messages in thread
From: elrodc at gmail dot com @ 2023-12-05  5:06 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112824

--- Comment #8 from Chris Elrod <elrodc at gmail dot com> ---
> If it were designed the way you want, another issue would arise: should we lower 512-bit vector builtins/intrinsics to ymm/xmm when -mprefer-vector-width=256? The answer is we'd rather not.

To be clear, what I meant by

>  it would be great to respect
> `-mprefer-vector-width=512`, it should ideally also be able to respect
> vector builtins/intrinsics

is that when someone uses 512-bit vector builtins, codegen should generate
512-bit code regardless of the `-mprefer-vector-width` setting.
That is, as a developer, I would want 512-bit builtins to mean we get 512-bit
vector code generation.

> If the user explicitly uses a 512-bit vector type, builtins or intrinsics, GCC will generate zmm no matter what -mprefer-vector-width= is.

This is what I would want, and I'd also want it to apply to movement of
`struct`s holding vector builtin objects, instead of the `ymm` usage as we see
here.
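
For example (a hypothetical helper using the types from this report, not code
taken from it), I would want a plain copy like the following to be done with
zmm loads/stores rather than split into ymm halves:

void copy_dual(Dual<Dual<double, 8>, 2> &dst,
               const Dual<Dual<double, 8>, 2> &src) {
    dst = src;  // struct holding vector builtins; ideally moved via zmm
}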

> And yes, there can be a mismatch between 512-bit intrinsics and the architecture tuning when you use 512-bit intrinsics but also rely on the compiler's own code generation to handle a struct
> For such a case, an explicit -mprefer-vector-width=512 is needed.

Note the template partial specialization

template <std::floating_point T, ptrdiff_t N> struct Vector<T,N>{
    static constexpr ptrdiff_t W = N >= VecWidth<T> ? VecWidth<T> :
ptrdiff_t(std::bit_ceil(size_t(N))); 
    static constexpr ptrdiff_t L = (N/W) + ((N%W)!=0);
    using V = Vec<W,T>;
    V data[L];
    static constexpr auto size()->ptrdiff_t{return N;}
};

Thus, `Vector`s in this example may explicitly be structs containing arrays of
vector builtins. I would expect these structs not to need an
`-mprefer-vector-width=512` setting for GCC to produce 512-bit code handling
them.
Given small `L`, I would also expect passing such a struct as an argument by
value to a non-inlined function to use `zmm` registers when possible, for
example.


* [Bug middle-end/112824] Stack spills and vector splitting with vector builtins
  2023-12-02 11:42 [Bug tree-optimization/112824] New: Stack spills and vector splitting with vector builtins elrodc at gmail dot com
                   ` (7 preceding siblings ...)
  2023-12-05  5:06 ` elrodc at gmail dot com
@ 2023-12-07  7:21 ` wwwhhhyyy333 at gmail dot com
  2023-12-15  2:36 ` cvs-commit at gcc dot gnu.org
  9 siblings, 0 replies; 11+ messages in thread
From: wwwhhhyyy333 at gmail dot com @ 2023-12-07  7:21 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112824

Hongyu Wang <wwwhhhyyy333 at gmail dot com> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |wwwhhhyyy333 at gmail dot com

--- Comment #9 from Hongyu Wang <wwwhhhyyy333 at gmail dot com> ---
(In reply to Hongtao Liu from comment #4)
> There are two reasons.

> 2. There are still spills for (subreg:DF (reg:V8DF)) since
> ix86_modes_tieable_p returns false for DF and V8DF.

There could be an issue in SRA: the aggregates are not properly scalarized
because of the size limit.

SRA caps the maximum aggregate size at move_ratio * UNITS_PER_WORD, but here
the aggregate Dual<Dual<double, 8l>, 2l> actually contains several V8DF
components that can be handled in zmm registers under avx512f.

Adding --param sra-max-scalarization-size-Ospeed=2048 eliminates those spills.

So for SRA we could consider using MOVE_MAX * move_ratio as the size limit for
Ospeed, which reflects the real backend instruction count.
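
For reference, that workaround combined with the flags used earlier in the
report would look like this (the source file name is a placeholder):

g++ -std=gnu++23 -march=skylake-avx512 -mprefer-vector-width=512 -O3 \
    --param sra-max-scalarization-size-Ospeed=2048 dual.cc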


* [Bug middle-end/112824] Stack spills and vector splitting with vector builtins
  2023-12-02 11:42 [Bug tree-optimization/112824] New: Stack spills and vector splitting with vector builtins elrodc at gmail dot com
                   ` (8 preceding siblings ...)
  2023-12-07  7:21 ` wwwhhhyyy333 at gmail dot com
@ 2023-12-15  2:36 ` cvs-commit at gcc dot gnu.org
  9 siblings, 0 replies; 11+ messages in thread
From: cvs-commit at gcc dot gnu.org @ 2023-12-15  2:36 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112824

--- Comment #10 from GCC Commits <cvs-commit at gcc dot gnu.org> ---
The master branch has been updated by Hongyu Wang <hongyuw@gcc.gnu.org>:

https://gcc.gnu.org/g:a52940cfee0908aed0d2a674a451f6d99844444d

commit r14-6575-ga52940cfee0908aed0d2a674a451f6d99844444d
Author: Hongyu Wang <hongyu.wang@intel.com>
Date:   Mon Dec 11 09:15:02 2023 +0800

    i386: Sync move_max/store_max with prefer-vector-width [PR112824]

    Currently move_max follows the tuning feature first, but ideally it
    should sync with prefer-vector-width when it is explicitly set to keep
    vector move and operation with same vector size.

    gcc/ChangeLog:

            PR target/112824
            * config/i386/i386-options.cc (ix86_option_override_internal):
            Sync ix86_move_max/ix86_store_max with prefer_vector_width when
            it is explicitly set.

    gcc/testsuite/ChangeLog:

            PR target/112824
            * gcc.target/i386/pieces-memset-45.c: Remove
            -mprefer-vector-width=256.
            * g++.target/i386/pr112824-1.C: New test.

