public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug rtl-optimization/96738] New: GCC generates worse assembly than clang and It fails to vectorized code compared to clang
@ 2020-08-21 18:34 euloanty at live dot com
2020-08-24 20:10 ` [Bug tree-optimization/96738] " pinskia at gcc dot gnu.org
2020-08-25 11:36 ` rguenth at gcc dot gnu.org
0 siblings, 2 replies; 3+ messages in thread
From: euloanty at live dot com @ 2020-08-21 18:34 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96738
Bug ID: 96738
Summary: GCC generates worse assembly than clang and It fails
to vectorized code compared to clang
Product: gcc
Version: 11.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: rtl-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: euloanty at live dot com
Target Milestone: ---
https://godbolt.org/z/9K3369
#include<array>
#include<cstdint>
// Aggregate wrapping five 64-bit unsigned limbs, with indexing forwarded
// directly to the underlying std::array.
struct number
{
// Limb storage: exactly five std::uint64_t values.
std::array<std::uint64_t,5> num;
// Mutable access to limb `position`. Forwards to std::array::operator[],
// which performs no bounds checking.
inline constexpr std::uint64_t& operator[](std::size_t position)
noexcept
{
return num[position];
}
// Read-only access to limb `position` (same unchecked forwarding).
inline constexpr std::uint64_t const& operator[](std::size_t position)
const noexcept
{
return num[position];
}
};
// Adds `a` and `b` limb by limb with a serial carry chain: for limbs 1..4 the
// sum plus the incoming carry is computed, bits 51 and above become the next
// carry, and the stored limb is masked to its low 51 bits. The carry out of
// limb 4 is multiplied by 19 and stored as limb 0.
// Note: limb 0 of the result holds only that final carry times 19; the low
// 51 bits of a[0] + b[0] are not stored anywhere.
// NOTE(review): 51-bit limbs and the factor 19 resemble a reduction modulo
// 2^255 - 19 — confirm intent with the reporter.
number add_reduce(number const& a,number const& b) noexcept
{
// Mask selecting the low 51 bits: 2^51 - 1 (0x7FFFFFFFFFFFF).
constexpr auto reduce_mask_51{(static_cast<std::uint64_t>(1) << 51) - 1};
// Not initialized here; every limb (1..4, then 0) is assigned below.
number out;
// Carry out of limb 0: bits 51 and above of a[0] + b[0].
std::uint64_t c{(a[0] + b[0])>>51};
// Each line: sum the limbs plus the incoming carry, extract the next carry,
// then keep only the low 51 bits. Every step depends on the previous one
// through `c`; only the four masking operations and stores are independent.
out[1] = a[1] + b[1] + c; c = (out[1] >> 51); out[1] &= reduce_mask_51;
out[2] = a[2] + b[2] + c; c = (out[2] >> 51); out[2] &= reduce_mask_51;
out[3] = a[3] + b[3] + c; c = (out[3] >> 51); out[3] &= reduce_mask_51;
out[4] = a[4] + b[4] + c; c = (out[4] >> 51); out[4] &= reduce_mask_51;
// Final carry, scaled by 19, becomes limb 0.
out[0] = c * 19;
return out;
}
gcc:
add_reduce(number const&, number const&):
movq (%rdx), %rax
addq (%rsi), %rax
movq %rdi, %r8
movq %rdx, %rdi
shrq $51, %rax
movq 8(%rdx), %rdx
addq 8(%rsi), %rdx
movq %rsi, %rcx
movabsq $2251799813685247, %rsi
addq %rdx, %rax
movq %rax, %rdx
shrq $51, %rax
andq %rsi, %rdx
movq %rdx, 8(%r8)
movq 16(%rdi), %rdx
addq 16(%rcx), %rdx
addq %rdx, %rax
movq %rax, %rdx
shrq $51, %rax
andq %rsi, %rdx
movq %rdx, 16(%r8)
movq 24(%rdi), %rdx
addq 24(%rcx), %rdx
addq %rax, %rdx
movq %rdx, %rax
shrq $51, %rdx
andq %rsi, %rax
movq %rax, 24(%r8)
movq 32(%rdi), %rax
addq 32(%rcx), %rax
addq %rdx, %rax
andq %rax, %rsi
shrq $51, %rax
leaq (%rax,%rax,8), %rdx
movq %rsi, 32(%r8)
leaq (%rax,%rdx,2), %rax
movq %rax, (%r8)
movq %r8, %rax
ret
clang:
add_reduce(number const&, number const&): # @add_reduce(number
const&, number const&)
movq %rdi, %rax
movq (%rdx), %rcx
movq 8(%rdx), %rdi
addq (%rsi), %rcx
shrq $51, %rcx
addq 8(%rsi), %rdi
addq %rcx, %rdi
movq %rdi, %rcx
shrq $51, %rcx
movabsq $2251799813685247, %r8 # imm = 0x7FFFFFFFFFFFF
andq %r8, %rdi
movq %rdi, 8(%rax)
movq 16(%rdx), %rdi
addq 16(%rsi), %rdi
addq %rcx, %rdi
movq %rdi, %rcx
shrq $51, %rcx
andq %r8, %rdi
movq %rdi, 16(%rax)
movq 24(%rdx), %rdi
addq 24(%rsi), %rdi
addq %rcx, %rdi
movq %rdi, %rcx
andq %r8, %rdi
movq %rdi, 24(%rax)
movq 32(%rdx), %rdx
addq 32(%rsi), %rdx
shrq $51, %rcx
addq %rcx, %rdx
movq %rdx, %rcx
shrq $51, %rcx
andq %r8, %rdx
movq %rdx, 32(%rax)
leaq (%rcx,%rcx,8), %rdx
leaq (%rcx,%rdx,2), %rcx
movq %rcx, (%rax)
retq
clang with -march=native
.LCPI0_0:
.quad 2251799813685247
add_reduce(number const&, number const&): # @add_reduce(number
const&, number const&)
movq %rdi, %rax
movq (%rdx), %rcx
movq 8(%rdx), %rdi
addq (%rsi), %rcx
shrq $51, %rcx
addq 8(%rsi), %rdi
addq %rcx, %rdi
vmovq %rdi, %xmm0
shrq $51, %rdi
movq 16(%rdx), %rcx
addq 16(%rsi), %rcx
addq %rdi, %rcx
vmovq %rcx, %xmm1
shrq $51, %rcx
movq 24(%rdx), %rdi
addq 24(%rsi), %rdi
addq %rcx, %rdi
vmovq %rdi, %xmm2
shrq $51, %rdi
movq 32(%rdx), %rcx
addq 32(%rsi), %rcx
addq %rdi, %rcx
vpunpcklqdq %xmm1, %xmm0, %xmm0 # xmm0 = xmm0[0],xmm1[0]
vmovq %rcx, %xmm1
vpunpcklqdq %xmm1, %xmm2, %xmm1 # xmm1 = xmm2[0],xmm1[0]
vinserti128 $1, %xmm1, %ymm0, %ymm0
vpandq .LCPI0_0(%rip){1to4}, %ymm0, %ymm0
shrq $51, %rcx
vmovdqu %ymm0, 8(%rax)
leaq (%rcx,%rcx,8), %rdx
leaq (%rcx,%rdx,2), %rcx
movq %rcx, (%rax)
vzeroupper
retq
^ permalink raw reply [flat|nested] 3+ messages in thread
* [Bug tree-optimization/96738] GCC generates worse assembly than clang and It fails to vectorized code compared to clang
2020-08-21 18:34 [Bug rtl-optimization/96738] New: GCC generates worse assembly than clang and It fails to vectorized code compared to clang euloanty at live dot com
@ 2020-08-24 20:10 ` pinskia at gcc dot gnu.org
2020-08-25 11:36 ` rguenth at gcc dot gnu.org
1 sibling, 0 replies; 3+ messages in thread
From: pinskia at gcc dot gnu.org @ 2020-08-24 20:10 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96738
Andrew Pinski <pinskia at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Target|All |x86_64-linux-gnu
Component|rtl-optimization |tree-optimization
Severity|normal |enhancement
Build|All |
Host|All |
^ permalink raw reply [flat|nested] 3+ messages in thread
* [Bug tree-optimization/96738] GCC generates worse assembly than clang and It fails to vectorized code compared to clang
2020-08-21 18:34 [Bug rtl-optimization/96738] New: GCC generates worse assembly than clang and It fails to vectorized code compared to clang euloanty at live dot com
2020-08-24 20:10 ` [Bug tree-optimization/96738] " pinskia at gcc dot gnu.org
@ 2020-08-25 11:36 ` rguenth at gcc dot gnu.org
1 sibling, 0 replies; 3+ messages in thread
From: rguenth at gcc dot gnu.org @ 2020-08-25 11:36 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96738
Richard Biener <rguenth at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Status|UNCONFIRMED |NEW
Ever confirmed|0 |1
Last reconfirmed| |2020-08-25
--- Comment #1 from Richard Biener <rguenth at gcc dot gnu.org> ---
GCC fails to SLP vectorize this because SLP discovery fails for this reduction
scheme. The only thing clang vectorizes seems to be the mask reduction
and the store.
We're not falling back to scalar operand construction for the mask AND
operation, for some reason.
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2020-08-25 11:36 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-08-21 18:34 [Bug rtl-optimization/96738] New: GCC generates worse assembly than clang and It fails to vectorized code compared to clang euloanty at live dot com
2020-08-24 20:10 ` [Bug tree-optimization/96738] " pinskia at gcc dot gnu.org
2020-08-25 11:36 ` rguenth at gcc dot gnu.org
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).