public inbox for gcc-bugs@sourceware.org
* [Bug rtl-optimization/99548] New: Help me! Lost the fight against the compiler.
@ 2021-03-11 11:00 unlvsur at live dot com
  2021-03-11 20:10 ` [Bug target/99548] " pinskia at gcc dot gnu.org
                   ` (2 more replies)
  0 siblings, 3 replies; 4+ messages in thread
From: unlvsur at live dot com @ 2021-03-11 11:00 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99548

            Bug ID: 99548
           Summary: Help me! Lost the fight against the compiler.
           Product: gcc
           Version: 11.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: rtl-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: unlvsur at live dot com
  Target Milestone: ---

https://godbolt.org/z/Kd8q57

The compiler simply cannot generate assembly as good as what I wrote by hand.
Unfortunately, because of calling-convention issues, the inline asm version
will never be portable, and it also hurts other compiler optimizations.


There are patterns like the following that neither GCC nor clang handles well:
    std::uint64_t v=0;//set to 0 to keep the compiler happy; otherwise it is UB.
    sub_borrow(carry,v,v,v);


        movl    $0, %eax
        movq    %rax, %rdx
        sbbq    %rax, %rdx

What the code actually wants is a single
        sbb %rdx,%rdx
which sets the register to UINT64_MAX when the carry (borrow) flag is 1, and to 0 otherwise.
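
For clarity, here is a minimal C++ sketch of the value being computed (not part
of the original report; the name borrow_mask is hypothetical, sub_borrow is the
helper from the testcase):

    // With v initially 0, sub_borrow(carry,v,v,v) computes v = 0 - 0 - carry,
    // so v becomes UINT64_MAX when carry is set and 0 otherwise.
    inline std::uint64_t borrow_mask(bool carry) noexcept
    {
        std::uint64_t v = 0;
        sub_borrow(carry, v, v, v); // ideally lowered to a single "sbb %rdx,%rdx"
        return v;
    }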

        movq    (%rdx), %rcx
        movq    8(%rsi), %r9
        addq    (%rsi), %rcx
        movq    16(%rsi), %r8
        movq    24(%rdx), %rax
        adcq    8(%rdx), %r9
        movq    24(%rsi), %rsi
        adcq    16(%rdx), %r8
        adcq    %rax, %rsi
        movl    $0, %eax

Is there any chance of fixing these optimization issues in the compiler?


* [Bug target/99548] Help me! Lost the fight against the compiler.
  2021-03-11 11:00 [Bug rtl-optimization/99548] New: Help me! Lost the fight against the compiler unlvsur at live dot com
@ 2021-03-11 20:10 ` pinskia at gcc dot gnu.org
  2021-03-11 23:35 ` unlvsur at live dot com
  2021-12-23 21:45 ` pinskia at gcc dot gnu.org
  2 siblings, 0 replies; 4+ messages in thread
From: pinskia at gcc dot gnu.org @ 2021-03-11 20:10 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99548

Andrew Pinski <pinskia at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
          Component|rtl-optimization            |target

--- Comment #1 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
For aarch64 we get:
        ldp     x4, x3, [x1]
        mov     x7, 38
        ldp     x6, x5, [x2]
        ldr     x8, [x1, 16]
        add     x6, x4, x6
        cmp     x4, x6
        adc     x5, x3, x5
        cmp     x3, x5
        ldr     x3, [x1, 24]
        ldp     x4, x1, [x2, 16]
        adc     x4, x8, x4
        cmp     x8, x4
        adc     x1, x3, x1
        cmp     x3, x1
        csetm   x2, cs
        and     x2, x2, x7
        add     x2, x2, x6
        cmp     x2, x6
        cinc    x6, x5, ls
        cmp     x6, x5
        cinc    x5, x4, ls
        cmp     x5, x4
        cinc    x3, x1, ls
        cmp     x3, x1
        csetm   x1, ls
        and     x1, x1, x7
        add     x1, x1, x2
        str     x1, [x0]
        cmp     x1, x2
        cinc    x1, x6, ls
        str     x1, [x0, 8]
        cmp     x1, x6
        cinc    x1, x5, ls
        cmp     x1, x5
        cinc    x3, x3, ls
        stp     x1, x3, [x0, 16]
        ret
There is really only one missing optimization there:
        csetm   x1, ls
        and     x1, x1, x7
should be turned into:
        csel    x1, x7, xzr, ls
This pattern is repeated a few times.
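
In C terms (a sketch added for clarity, not from the original comment; select38
is a hypothetical name, and x7 holds the constant 38 here):

    // csetm x1, ls ; and x1, x1, x7   computes   (ls ? ~0 : 0) & 38
    // csel  x1, x7, xzr, ls           computes   ls ? 38 : 0
    inline std::uint64_t select38(bool ls) noexcept
    {
        std::uint64_t mask = ls ? ~0ull : 0; // csetm
        return mask & 38;                    // and; same value as ls ? 38 : 0
    }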

It looks like x86_64 has other issues.


* [Bug target/99548] Help me! Lost the fight against the compiler.
  2021-03-11 11:00 [Bug rtl-optimization/99548] New: Help me! Lost the fight against the compiler unlvsur at live dot com
  2021-03-11 20:10 ` [Bug target/99548] " pinskia at gcc dot gnu.org
@ 2021-03-11 23:35 ` unlvsur at live dot com
  2021-12-23 21:45 ` pinskia at gcc dot gnu.org
  2 siblings, 0 replies; 4+ messages in thread
From: unlvsur at live dot com @ 2021-03-11 23:35 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99548

--- Comment #2 from cqwrteur <unlvsur at live dot com> ---
(In reply to Andrew Pinski from comment #1)
> For aarch64 we get:
>         ldp     x4, x3, [x1]
>         mov     x7, 38
>         ldp     x6, x5, [x2]
>         ldr     x8, [x1, 16]
>         add     x6, x4, x6
>         cmp     x4, x6
>         adc     x5, x3, x5
>         cmp     x3, x5
>         ldr     x3, [x1, 24]
>         ldp     x4, x1, [x2, 16]
>         adc     x4, x8, x4
>         cmp     x8, x4
>         adc     x1, x3, x1
>         cmp     x3, x1
>         csetm   x2, cs
>         and     x2, x2, x7
>         add     x2, x2, x6
>         cmp     x2, x6
>         cinc    x6, x5, ls
>         cmp     x6, x5
>         cinc    x5, x4, ls
>         cmp     x5, x4
>         cinc    x3, x1, ls
>         cmp     x3, x1
>         csetm   x1, ls
>         and     x1, x1, x7
>         add     x1, x1, x2
>         str     x1, [x0]
>         cmp     x1, x2
>         cinc    x1, x6, ls
>         str     x1, [x0, 8]
>         cmp     x1, x6
>         cinc    x1, x5, ls
>         cmp     x1, x5
>         cinc    x3, x3, ls
>         stp     x1, x3, [x0, 16]
>         ret
> There is really only one missing optimization there:
>         csetm   x1, ls
>         and     x1, x1, x7
> should be turned into:
>         csel    x1, x7, xzr, ls
> This pattern is repeated a few times.
> 
> It looks like x86_64 has other issues.

My add-carry implementation for non-x86_64 platforms is wrong on godbolt.


* [Bug target/99548] Help me! Lost the fight against the compiler.
  2021-03-11 11:00 [Bug rtl-optimization/99548] New: Help me! Lost the fight against the compiler unlvsur at live dot com
  2021-03-11 20:10 ` [Bug target/99548] " pinskia at gcc dot gnu.org
  2021-03-11 23:35 ` unlvsur at live dot com
@ 2021-12-23 21:45 ` pinskia at gcc dot gnu.org
  2 siblings, 0 replies; 4+ messages in thread
From: pinskia at gcc dot gnu.org @ 2021-12-23 21:45 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99548

--- Comment #3 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
#include<cstdint>
#include<array>
#include<type_traits> // for std::conditional_t
#if __has_include(<concepts>)
#include<concepts> // for std::unsigned_integral
#endif
#if defined(_MSC_VER)
#include<intrin.h>
#elif defined(__x86_64__) || defined(__i386__)
#include<immintrin.h>
#endif
using field_number = std::conditional_t<(sizeof(std::size_t)>=8),
        std::array<std::uint64_t,4>,std::array<std::uint32_t,8>>;
namespace intrinsics
{
template<typename T>
#if __cpp_lib_concepts >= 202002L
requires (std::unsigned_integral<T>)
#endif
inline constexpr bool sub_borrow(bool borrow,T a,T b,T& out) noexcept
{
#if defined(_MSC_VER) || defined(__x86_64__) || defined(__i386__)
#if __cpp_lib_is_constant_evaluated >= 201811L
        if(std::is_constant_evaluated())
                return (out=a-b-borrow)>=a;
        else
#endif
        {
                if constexpr(sizeof(T)==8)
#if defined(__x86_64__)
                        return _subborrow_u64(borrow,a,b,
#if !defined(__INTEL_COMPILER) && (defined(__GNUC__) || defined(__clang__))
                        reinterpret_cast<unsigned long long*>(&out));
#else
                        &out);
#endif
#else
                        return (out=a-b-borrow)>=a;
#endif

                if constexpr(sizeof(T)==4)
                        return _subborrow_u32(borrow,a,b,reinterpret_cast<std::uint32_t*>(&out));
                else if constexpr(sizeof(T)==2)
                        return _subborrow_u16(borrow,a,b,reinterpret_cast<std::uint16_t*>(&out));
                else if constexpr(sizeof(T)==1)
                        return _subborrow_u8(borrow,a,b,reinterpret_cast<std::uint8_t*>(&out));
        }
#else
        return (out=a-b-borrow)>=a;
#endif
}

}

template<typename T>
#if __cpp_lib_concepts >= 202002L
requires (std::unsigned_integral<T>)
#endif
inline constexpr bool add_carry(bool carry,T a,T b,T& out) noexcept
{
#if defined(_MSC_VER) || defined(__x86_64__) || defined(__i386__)
#if __cpp_lib_is_constant_evaluated >= 201811L
        if(std::is_constant_evaluated())
                return (out=a+b+carry)<=a;
        else
#endif
        {
                if constexpr(sizeof(T)==8)
#if defined(__x86_64__)
                        return _addcarry_u64(carry,a,b,
#if !defined(__INTEL_COMPILER) && (defined(__GNUC__) || defined(__clang__))
                        reinterpret_cast<unsigned long long*>(&out));
#else
                        &out);
#endif
#else
                        return (out=a+b+carry)<=a;
#endif

                else if constexpr(sizeof(T)==4)
                        return _addcarry_u32(carry,a,b,reinterpret_cast<std::uint32_t*>(&out));
                else if constexpr(sizeof(T)==2)
                        return _addcarry_u16(carry,a,b,reinterpret_cast<std::uint16_t*>(&out));
                else if constexpr(sizeof(T)==1)
                        return _addcarry_u8(carry,a,b,reinterpret_cast<std::uint8_t*>(&out));
        }
#else
        return (out=a+b+carry)<=a;
#endif
}

void my_asm_field_add(
        std::uint64_t* __restrict r,
        std::uint64_t const* __restrict x,
        std::uint64_t const* __restrict y) noexcept
{
        std::uint64_t r0,r1,r2,r3;
        std::uint64_t rv;
__asm__ __volatile__(R"(mov (%[x]),%[r0]
        add (%[y]),%[r0]
        mov 8(%[x]),%[r1]
        adc 8(%[y]),%[r1]
        mov 16(%[x]),%[r2]
        adc 16(%[y]),%[r2]
        mov 24(%[x]),%[r3]
        adc 24(%[y]),%[r3]
        sbb %[rv],%[rv]
        and $38,%[rv]
        add %[rv],%[r0]
        adc $0,%[r1]
        adc $0,%[r2]
        adc $0,%[r3]
        sbb %[rv],%[rv]
        and $38,%[rv]
        add %[rv],%[r0]
        mov %[r0],(%[res])
        adc $0,%[r1]
        mov %[r1],8(%[res])
        adc $0,%[r2]
        mov %[r2],16(%[res])
        adc $0,%[r3]
        mov %[r3],24(%[res]))":
[r0]"=&r"(r0),[r1]"=&r"(r1),[r2]"=&r"(r2),[r3]"=&r"(r3),[rv]"=&r"(rv):
[x]"r"(x),[y]"r"(y),[res]"r"(r):"memory","cc");
}

void intrinsics_add(std::uint64_t* __restrict f,
        std::uint64_t const* __restrict x,
        std::uint64_t const* __restrict y) noexcept
{
        using namespace intrinsics;
        using unsigned_type = field_number::value_type;
        constexpr unsigned_type zero{};
        std::uint64_t f0,f1,f2,f3;
        bool carry{add_carry(false,x[0],y[0],f0)};
        carry=add_carry(carry,x[1],y[1],f1);
        carry=add_carry(carry,x[2],y[2],f2);
        carry=add_carry(carry,x[3],y[3],f3);
        unsigned_type v=0;
        sub_borrow(carry,v,v,v);
        v&=static_cast<unsigned_type>(38);
        carry=add_carry(false,f0,v,f0);
        carry=add_carry(carry,f1,zero,f1);
        carry=add_carry(carry,f2,zero,f2);
        carry=add_carry(carry,f3,zero,f3);
        sub_borrow(carry,v,v,v);
        v&=static_cast<unsigned_type>(38);
        carry=add_carry(false,f0,v,f[0]);
        carry=add_carry(carry,f1,zero,f[1]);
        carry=add_carry(carry,f2,zero,f[2]);
        carry=add_carry(carry,f3,zero,f[3]);
}
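
A minimal driver for the testcase above (hypothetical, added for illustration;
not part of the original report):

#include<cstdio>

int main()
{
        std::uint64_t x[4]={1,2,3,4};
        std::uint64_t y[4]={5,6,7,8};
        std::uint64_t r[4]{};
        intrinsics_add(r,x,y);
        std::printf("%llu %llu %llu %llu\n",
                static_cast<unsigned long long>(r[0]),
                static_cast<unsigned long long>(r[1]),
                static_cast<unsigned long long>(r[2]),
                static_cast<unsigned long long>(r[3]));
}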

