public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug c++/108320] New: Missing vector/array arithmetic optimization compared to valarray
@ 2023-01-06 21:44 diegoandres91b at hotmail dot com
  2023-01-06 21:54 ` [Bug tree-optimization/108320] " pinskia at gcc dot gnu.org
                   ` (2 more replies)
  0 siblings, 3 replies; 4+ messages in thread
From: diegoandres91b at hotmail dot com @ 2023-01-06 21:44 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108320

            Bug ID: 108320
           Summary: Missing vector/array arithmetic optimization compared
                    to valarray
           Product: gcc
           Version: 12.2.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: c++
          Assignee: unassigned at gcc dot gnu.org
          Reporter: diegoandres91b at hotmail dot com
  Target Milestone: ---

The following code (compiled with -O3 -mavx2 -mfma):

#include <valarray>
#include <vector>
#include <array>

using namespace std;

// valarray variant of the test case: returns a new valarray<float> holding
// the element-wise result of a * b + c, using valarray's own arithmetic
// operators.
// NOTE(review): no size check is performed here; presumably a, b and c are
// expected to have the same length — confirm against callers.
valarray<float> fma1(const valarray<float> &a, const valarray<float> &b, const
valarray<float> &c) {
    return a * b + c;
}

// Wrapper type that publicly derives from vector<T> and adds no members of
// its own beyond a sizing constructor.
// NOTE(review): presumably exists so the arithmetic operators in this file
// can be overloaded on a user-defined type rather than on std::vector itself.
template<class T>
struct vec : vector<T> {
    // Forwards `count` to vector<T>'s count constructor.
    constexpr vec(size_t count) : vector<T>(count) {}
};

// Element-wise product of two vec<T>.
// Creates a result with a.size() elements and stores a[i] * b[i] in each one;
// the result is returned by value.
// NOTE(review): b is indexed over the same range with no size check —
// assumes b.size() >= a.size(); TODO confirm.
template<class T>
constexpr vec<T> operator*(const vec<T> &a, const vec<T> &b) {
    vec<T> c(a.size());
    for (size_t i = 0; i < c.size(); ++i) c[i] = a[i] * b[i];
    return c;
}

// Element-wise sum of two vec<T>.
// Creates a result with a.size() elements and stores a[i] + b[i] in each one;
// the result is returned by value.
// NOTE(review): b is indexed over the same range with no size check —
// assumes b.size() >= a.size(); TODO confirm.
template<class T>
constexpr vec<T> operator+(const vec<T> &a, const vec<T> &b) {
    vec<T> c(a.size());
    for (size_t i = 0; i < c.size(); ++i) c[i] = a[i] + b[i];
    return c;
}

// vector-based variant of the test case: evaluates a * b with the vec
// operator* (yielding a temporary vec<float>), then adds c with the vec
// operator+. Each operator runs its own loop over the elements.
vec<float> fma2(const vec<float> &a, const vec<float> &b, const vec<float> &c)
{
    return a * b + c;
}

// Wrapper type that publicly derives from array<T, N> and adds no members.
// NOTE(review): presumably exists so the arithmetic operators in this file
// can be overloaded on a user-defined type rather than on std::array itself.
template<class T, size_t N>
struct arr : array<T, N> {
};

// Element-wise product of two arr<T, N>.
// The result `c` is default-initialized and then every element is assigned
// a[i] * b[i] by the loop before `c` is returned by value.
template<class T, size_t N>
constexpr arr<T, N> operator*(const arr<T, N> &a, const arr<T, N> &b) {
    arr<T, N> c;
    for (size_t i = 0; i < c.size(); ++i) c[i] = a[i] * b[i];
    return c;
}

// Element-wise sum of two arr<T, N>.
// The result `c` is default-initialized and then every element is assigned
// a[i] + b[i] by the loop before `c` is returned by value.
template<class T, size_t N>
constexpr arr<T, N> operator+(const arr<T, N> &a, const arr<T, N> &b) {
    arr<T, N> c;
    for (size_t i = 0; i < c.size(); ++i) c[i] = a[i] + b[i];
    return c;
}

// Compile-time element count used for the arr<float, N> parameters of fma3.
constexpr size_t N = 1024;

// Fixed-size array variant of the test case: evaluates a * b with the arr
// operator* (yielding a temporary arr<float, N>), then adds c with the arr
// operator+. Each operator runs its own loop over the N elements.
arr<float, N> fma3(const arr<float, N> &a, const arr<float, N> &b, const
arr<float, N> &c) {
    return a * b + c;
}

GCC only optimizes the valarray version (fma1) of the fma function (it uses
vfmadd132ps):

...

.L4:
        vmovups ymm0, YMMWORD PTR [rdi+rax]
        vmovups ymm1, YMMWORD PTR [rcx+rax]
        vfmadd132ps     ymm0, ymm1, YMMWORD PTR [rsi+rax]
        vmovups YMMWORD PTR [rdx+rax], ymm0
        add     rax, 32
        cmp     rax, r8
        jne     .L4
        mov     rax, r10
        and     rax, -8
        lea     r9, [0+rax*4]
        lea     r11, [rdx+r9]
        test    r10b, 7
        je      .L22
        vzeroupper
.L3:
        mov     r8, r10
        sub     r8, rax
        lea     r12, [r8-1]
        cmp     r12, 2
        jbe     .L6
        vmovups xmm0, XMMWORD PTR [rdi+rax*4]
        vmovups xmm2, XMMWORD PTR [rcx+rax*4]
        vfmadd132ps     xmm0, xmm2, XMMWORD PTR [rsi+rax*4]
        vmovups XMMWORD PTR [rdx+r9], xmm0
        test    r8b, 3
        je      .L1
        and     r8, -4
        add     rax, r8
        lea     r11, [r11+r8*4]
        lea     r9, [0+rax*4]

...

But it does not optimize the vector or array versions of the function (fma2 and
fma3).

Note: For smaller N, fma3 is optimized, but for larger values such as 1024 in
the example it is not.

Compiler Explorer code: https://godbolt.org/z/v8dnx5aMo

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [Bug tree-optimization/108320] Missing vector/array arithmetic optimization compared to valarray
  2023-01-06 21:44 [Bug c++/108320] New: Missing vector/array arithmetic optimization compared to valarray diegoandres91b at hotmail dot com
@ 2023-01-06 21:54 ` pinskia at gcc dot gnu.org
  2023-01-06 21:57 ` pinskia at gcc dot gnu.org
  2023-01-06 21:59 ` pinskia at gcc dot gnu.org
  2 siblings, 0 replies; 4+ messages in thread
From: pinskia at gcc dot gnu.org @ 2023-01-06 21:54 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108320

Andrew Pinski <pinskia at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
          Component|c++                         |tree-optimization
           Severity|normal                      |enhancement
           Keywords|                            |missed-optimization

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [Bug tree-optimization/108320] Missing vector/array arithmetic optimization compared to valarray
  2023-01-06 21:44 [Bug c++/108320] New: Missing vector/array arithmetic optimization compared to valarray diegoandres91b at hotmail dot com
  2023-01-06 21:54 ` [Bug tree-optimization/108320] " pinskia at gcc dot gnu.org
@ 2023-01-06 21:57 ` pinskia at gcc dot gnu.org
  2023-01-06 21:59 ` pinskia at gcc dot gnu.org
  2 siblings, 0 replies; 4+ messages in thread
From: pinskia at gcc dot gnu.org @ 2023-01-06 21:57 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108320

--- Comment #1 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
GCC does not implement loop fusion, and loop fusion is needed to handle fma2/fma3.

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [Bug tree-optimization/108320] Missing vector/array arithmetic optimization compared to valarray
  2023-01-06 21:44 [Bug c++/108320] New: Missing vector/array arithmetic optimization compared to valarray diegoandres91b at hotmail dot com
  2023-01-06 21:54 ` [Bug tree-optimization/108320] " pinskia at gcc dot gnu.org
  2023-01-06 21:57 ` pinskia at gcc dot gnu.org
@ 2023-01-06 21:59 ` pinskia at gcc dot gnu.org
  2 siblings, 0 replies; 4+ messages in thread
From: pinskia at gcc dot gnu.org @ 2023-01-06 21:59 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108320

--- Comment #2 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
valarray is specifically designed to allow these kinds of optimizations.

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2023-01-06 21:59 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-01-06 21:44 [Bug c++/108320] New: Missing vector/array arithmetic optimization compared to valarray diegoandres91b at hotmail dot com
2023-01-06 21:54 ` [Bug tree-optimization/108320] " pinskia at gcc dot gnu.org
2023-01-06 21:57 ` pinskia at gcc dot gnu.org
2023-01-06 21:59 ` pinskia at gcc dot gnu.org

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).