public inbox for gcc-bugs@sourceware.org
* [Bug rtl-optimization/96738] New: GCC generates worse assembly than clang and fails to vectorize code compared to clang
@ 2020-08-21 18:34 euloanty at live dot com
  2020-08-24 20:10 ` [Bug tree-optimization/96738] " pinskia at gcc dot gnu.org
  2020-08-25 11:36 ` rguenth at gcc dot gnu.org
  0 siblings, 2 replies; 3+ messages in thread
From: euloanty at live dot com @ 2020-08-21 18:34 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96738

            Bug ID: 96738
           Summary: GCC generates worse assembly than clang and fails to
                    vectorize code compared to clang
           Product: gcc
           Version: 11.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: rtl-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: euloanty at live dot com
  Target Milestone: ---

https://godbolt.org/z/9K3369

#include<array>
#include<cstdint>

struct number
{
        std::array<std::uint64_t,5> num;


        inline constexpr std::uint64_t& operator[](std::size_t position) noexcept
        {
                return num[position];
        }
        inline constexpr std::uint64_t const& operator[](std::size_t position) const noexcept
        {
                return num[position];
        }
};


number add_reduce(number const& a, number const& b) noexcept
{
        constexpr auto reduce_mask_51{(static_cast<std::uint64_t>(1) << 51) - 1};
        number out;
        std::uint64_t c{(a[0] + b[0]) >> 51};
        out[1] = a[1] + b[1] + c; c = (out[1] >> 51); out[1] &= reduce_mask_51;
        out[2] = a[2] + b[2] + c; c = (out[2] >> 51); out[2] &= reduce_mask_51;
        out[3] = a[3] + b[3] + c; c = (out[3] >> 51); out[3] &= reduce_mask_51;
        out[4] = a[4] + b[4] + c; c = (out[4] >> 51); out[4] &= reduce_mask_51;
        out[0] = c * 19;
        return out;
}
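
A side note, not stated in the report: the constants look like radix-2^51
arithmetic on integers modulo 2^255 - 19.  Under that assumption each limb
holds 51 bits, the carry out of the top limb has weight 2^(5*51) = 2^255, and
since 2^255 mod (2^255 - 19) = 19 it can be folded back into the low limb as
c * 19.  The two-lea sequence both compilers emit below computes that multiply
without an imul; a hypothetical helper, only to make the arithmetic explicit:

// Illustration only, not code from the bug: what the leaq pair computes.
inline constexpr std::uint64_t times19(std::uint64_t c) noexcept
{
        std::uint64_t nine_c{c + c * 8};   // leaq (%rax,%rax,8): 9*c
        return c + nine_c * 2;             // leaq (%rax,%rdx,2): c + 18*c = 19*c
}
static_assert(times19(7) == 133);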


gcc:

add_reduce(number const&, number const&):
        movq    (%rdx), %rax
        addq    (%rsi), %rax
        movq    %rdi, %r8
        movq    %rdx, %rdi
        shrq    $51, %rax
        movq    8(%rdx), %rdx
        addq    8(%rsi), %rdx
        movq    %rsi, %rcx
        movabsq $2251799813685247, %rsi
        addq    %rdx, %rax
        movq    %rax, %rdx
        shrq    $51, %rax
        andq    %rsi, %rdx
        movq    %rdx, 8(%r8)
        movq    16(%rdi), %rdx
        addq    16(%rcx), %rdx
        addq    %rdx, %rax
        movq    %rax, %rdx
        shrq    $51, %rax
        andq    %rsi, %rdx
        movq    %rdx, 16(%r8)
        movq    24(%rdi), %rdx
        addq    24(%rcx), %rdx
        addq    %rax, %rdx
        movq    %rdx, %rax
        shrq    $51, %rdx
        andq    %rsi, %rax
        movq    %rax, 24(%r8)
        movq    32(%rdi), %rax
        addq    32(%rcx), %rax
        addq    %rdx, %rax
        andq    %rax, %rsi
        shrq    $51, %rax
        leaq    (%rax,%rax,8), %rdx
        movq    %rsi, 32(%r8)
        leaq    (%rax,%rdx,2), %rax
        movq    %rax, (%r8)
        movq    %r8, %rax
        ret

clang:
add_reduce(number const&, number const&):             # @add_reduce(number const&, number const&)
        movq    %rdi, %rax
        movq    (%rdx), %rcx
        movq    8(%rdx), %rdi
        addq    (%rsi), %rcx
        shrq    $51, %rcx
        addq    8(%rsi), %rdi
        addq    %rcx, %rdi
        movq    %rdi, %rcx
        shrq    $51, %rcx
        movabsq $2251799813685247, %r8          # imm = 0x7FFFFFFFFFFFF
        andq    %r8, %rdi
        movq    %rdi, 8(%rax)
        movq    16(%rdx), %rdi
        addq    16(%rsi), %rdi
        addq    %rcx, %rdi
        movq    %rdi, %rcx
        shrq    $51, %rcx
        andq    %r8, %rdi
        movq    %rdi, 16(%rax)
        movq    24(%rdx), %rdi
        addq    24(%rsi), %rdi
        addq    %rcx, %rdi
        movq    %rdi, %rcx
        andq    %r8, %rdi
        movq    %rdi, 24(%rax)
        movq    32(%rdx), %rdx
        addq    32(%rsi), %rdx
        shrq    $51, %rcx
        addq    %rcx, %rdx
        movq    %rdx, %rcx
        shrq    $51, %rcx
        andq    %r8, %rdx
        movq    %rdx, 32(%rax)
        leaq    (%rcx,%rcx,8), %rdx
        leaq    (%rcx,%rdx,2), %rcx
        movq    %rcx, (%rax)
        retq

clang with -march=native

.LCPI0_0:
        .quad   2251799813685247
add_reduce(number const&, number const&):             # @add_reduce(number const&, number const&)
        movq    %rdi, %rax
        movq    (%rdx), %rcx
        movq    8(%rdx), %rdi
        addq    (%rsi), %rcx
        shrq    $51, %rcx
        addq    8(%rsi), %rdi
        addq    %rcx, %rdi
        vmovq   %rdi, %xmm0
        shrq    $51, %rdi
        movq    16(%rdx), %rcx
        addq    16(%rsi), %rcx
        addq    %rdi, %rcx
        vmovq   %rcx, %xmm1
        shrq    $51, %rcx
        movq    24(%rdx), %rdi
        addq    24(%rsi), %rdi
        addq    %rcx, %rdi
        vmovq   %rdi, %xmm2
        shrq    $51, %rdi
        movq    32(%rdx), %rcx
        addq    32(%rsi), %rcx
        addq    %rdi, %rcx
        vpunpcklqdq     %xmm1, %xmm0, %xmm0     # xmm0 = xmm0[0],xmm1[0]
        vmovq   %rcx, %xmm1
        vpunpcklqdq     %xmm1, %xmm2, %xmm1     # xmm1 = xmm2[0],xmm1[0]
        vinserti128     $1, %xmm1, %ymm0, %ymm0
        vpandq  .LCPI0_0(%rip){1to4}, %ymm0, %ymm0
        shrq    $51, %rcx
        vmovdqu %ymm0, 8(%rax)
        leaq    (%rcx,%rcx,8), %rdx
        leaq    (%rcx,%rdx,2), %rcx
        movq    %rcx, (%rax)
        vzeroupper
        retq
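
Reading the -march=native output above: the five additions and the carry
shifts are still done in scalar registers; the only vectorized part is packing
out[1]..out[4] into %ymm0 with vmovq/vpunpcklqdq/vinserti128, masking all four
limbs at once with vpandq against a broadcast of 2251799813685247 (the 51-bit
mask), and storing them with a single 32-byte vmovdqu to 8(%rax).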


* [Bug tree-optimization/96738] GCC generates worse assembly than clang and fails to vectorize code compared to clang
  2020-08-21 18:34 [Bug rtl-optimization/96738] New: GCC generates worse assembly than clang and fails to vectorize code compared to clang euloanty at live dot com
@ 2020-08-24 20:10 ` pinskia at gcc dot gnu.org
  2020-08-25 11:36 ` rguenth at gcc dot gnu.org
  1 sibling, 0 replies; 3+ messages in thread
From: pinskia at gcc dot gnu.org @ 2020-08-24 20:10 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96738

Andrew Pinski <pinskia at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
             Target|All                         |x86_64-linux-gnu
          Component|rtl-optimization            |tree-optimization
           Severity|normal                      |enhancement
              Build|All                         |
               Host|All                         |


* [Bug tree-optimization/96738] GCC generates worse assembly than clang and fails to vectorize code compared to clang
  2020-08-21 18:34 [Bug rtl-optimization/96738] New: GCC generates worse assembly than clang and fails to vectorize code compared to clang euloanty at live dot com
  2020-08-24 20:10 ` [Bug tree-optimization/96738] " pinskia at gcc dot gnu.org
@ 2020-08-25 11:36 ` rguenth at gcc dot gnu.org
  1 sibling, 0 replies; 3+ messages in thread
From: rguenth at gcc dot gnu.org @ 2020-08-25 11:36 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96738

Richard Biener <rguenth at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
             Status|UNCONFIRMED                 |NEW
     Ever confirmed|0                           |1
   Last reconfirmed|                            |2020-08-25

--- Comment #1 from Richard Biener <rguenth at gcc dot gnu.org> ---
GCC fails to SLP vectorize this because SLP discovery fails for this reduction
scheme.  The only thing clang vectorizes seems to be the mask reduction
and the store.

We're not falling back to scalar operand construction for the mask AND,
for some reason.
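
A minimal sketch of that observation, not taken from the thread and not
verified against GCC's SLP pass: if the carry chain is kept scalar but the
mask AND over out[1]..out[4] is pulled into its own uniform loop, the four
ANDs plus the contiguous 32-byte store are exposed in the shape clang's
-march=native output uses (broadcast vpandq followed by vmovdqu):

// Hypothetical restructuring of the reporter's add_reduce; same results,
// with the masking separated from the carry propagation.
number add_reduce_split(number const& a, number const& b) noexcept
{
        constexpr auto reduce_mask_51{(static_cast<std::uint64_t>(1) << 51) - 1};
        number out;
        std::uint64_t c{(a[0] + b[0]) >> 51};
        for (std::size_t i{1}; i != 5; ++i)
        {
                out[i] = a[i] + b[i] + c;   // unmasked sum, as in the original
                c = out[i] >> 51;           // carry taken before masking
        }
        for (std::size_t i{1}; i != 5; ++i)
                out[i] &= reduce_mask_51;   // four ANDs + one contiguous store
        out[0] = c * 19;
        return out;
}

Whether GCC's SLP discovery handles this form any better is not established
here; it is only meant to make the vectorizable portion described above
concrete.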

