public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug rtl-optimization/97437] New: builtins subcarry and addcarry still not generate the right code. Not get optimized to immediate value
@ 2020-10-15  8:28 euloanty at live dot com
  2020-10-15 11:20 ` [Bug target/97437] " jakub at gcc dot gnu.org
                   ` (9 more replies)
  0 siblings, 10 replies; 11+ messages in thread
From: euloanty at live dot com @ 2020-10-15  8:28 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97437

            Bug ID: 97437
           Summary: builtins subcarry and addcarry still not generate the
                    right code. Not get optimized to immediate value
           Product: gcc
           Version: 11.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: rtl-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: euloanty at live dot com
  Target Milestone: ---

#include<cstdint>
#include<array>
#if defined(_MSC_VER)
#include<intrin.h>
#elif defined(__x86_64__) || defined(__i386__)
#include<immintrin.h>
#endif

/// Fixed-size multi-limb number: 32 bytes of storage split into unsigned limbs.
struct field_number
{
        /// Limb type: 64-bit when std::size_t is at least 8 bytes wide,
        /// 32-bit otherwise.
        using value_type = typename std::conditional<
                (sizeof(std::size_t) >= 8), std::uint64_t, std::uint32_t>::type;

        /// Limb storage; the element count is 32 bytes divided by the limb size.
        value_type content[32 / sizeof(value_type)];

        /// Read-only access to limb `pos`. No bounds checking is performed.
        constexpr value_type const& operator[](std::size_t pos) const noexcept
        {
                return content[pos];
        }

        /// Mutable access to limb `pos`. No bounds checking is performed.
        constexpr value_type& operator[](std::size_t pos) noexcept
        {
                return content[pos];
        }
};

namespace intrinsics
{
/// Subtract-with-borrow on one unsigned limb.
///
/// Stores `a - b - borrow` (wrapped to the width of T) in `out`.
///
/// @param borrow  incoming borrow from the previous (less significant) limb
/// @param a       minuend
/// @param b       subtrahend
/// @param out     receives the wrapped difference
/// @return true when the exact difference is negative, i.e. a borrow must be
///         propagated to the next limb
///
/// On x86 targets the 32-bit (and, on 64-bit targets, the 64-bit) case is
/// forwarded to the `_subborrow_u32` / `_subborrow_u64` intrinsics at run
/// time. Every other case, including constant evaluation, uses the portable
/// arithmetic at the bottom of the function.
template<typename T>
#if __cpp_lib_concepts >= 202002L
requires (std::unsigned_integral<T>)
#endif
inline constexpr bool sub_borrow(bool borrow,T a,T b,T& out) noexcept
{
// The intrinsics below exist only on x86; MSVC spells the target macros
// _M_X64 / _M_IX86 instead of __x86_64__ / __i386__.
#if defined(__x86_64__) || defined(__i386__) || defined(_M_X64) || defined(_M_IX86)
#if __cpp_lib_is_constant_evaluated >= 201811L
        // The intrinsics are not usable in constant expressions.
        if(!std::is_constant_evaluated())
#endif
        {
#if defined(__x86_64__) || defined(_M_X64)
                if constexpr(sizeof(T)==8)
                {
#if !defined(__INTEL_COMPILER) && (defined(__GNUC__) || defined(__clang__))
                        // GCC/Clang declare the output as unsigned long long*,
                        // which is not necessarily the same type as T*.
                        return _subborrow_u64(borrow,a,b,
                                reinterpret_cast<unsigned long long*>(&out));
#else
                        return _subborrow_u64(borrow,a,b,&out);
#endif
                }
#endif
                if constexpr(sizeof(T)==4)
                {
                        return _subborrow_u32(borrow,a,b,
                                reinterpret_cast<std::uint32_t*>(&out));
                }
#if defined(_MSC_VER) && !defined(__clang__)
                // Only MSVC's <intrin.h> declares the 16- and 8-bit variants.
                if constexpr(sizeof(T)==2)
                {
                        return _subborrow_u16(borrow,a,b,
                                reinterpret_cast<std::uint16_t*>(&out));
                }
                if constexpr(sizeof(T)==1)
                {
                        return _subborrow_u8(borrow,a,b,
                                reinterpret_cast<std::uint8_t*>(&out));
                }
#endif
        }
#endif
        // Portable path. The cast undoes integer promotion for narrow T.
        out=static_cast<T>(a-b-borrow);
        // A borrow is needed exactly when a < b + borrow; written without the
        // addition so that b == max(T) with an incoming borrow cannot overflow.
        return borrow?a<=b:a<b;
}

}


/// Limb-wise subtraction x - y with borrow propagation over limbs 0..3,
/// followed by two correction rounds: whenever the preceding chain borrowed
/// out of limb 3, the constant 38 is subtracted from the result.
///
/// NOTE(review): 38 is presumably 2^256 mod (2^255 - 19), which would make
/// this subtraction in the curve25519 field - confirm.
/// NOTE(review): only limbs 0..3 are read and written; when value_type is
/// 32 bits wide `content` holds eight limbs - confirm that configuration is
/// not expected to work.
field_number operator-(field_number const& x,field_number const& y) noexcept
{
        using namespace intrinsics;
        using unsigned_type = field_number::value_type;
        constexpr unsigned_type zero{};
        field_number f;

        // Plain multi-limb subtraction; `borrow` links consecutive limbs.
        bool borrow{sub_borrow(false,x[0],y[0],f[0])};
        borrow=sub_borrow(borrow,x[1],y[1],f[1]);
        borrow=sub_borrow(borrow,x[2],y[2],f[2]);
        borrow=sub_borrow(borrow,x[3],y[3],f[3]);

        // v = v - v - borrow: all ones when the chain borrowed, zero otherwise,
        // so after masking v is either 38 or 0.
        unsigned_type v{};
        sub_borrow(borrow,v,v,v);
        v&=static_cast<unsigned_type>(38);

        // First correction round.
        borrow=sub_borrow(false,f[0],v,f[0]);
        borrow=sub_borrow(borrow,f[1],zero,f[1]);
        borrow=sub_borrow(borrow,f[2],zero,f[2]);
        borrow=sub_borrow(borrow,f[3],zero,f[3]);

        // The correction itself may borrow; rebuild the mask from that borrow.
        sub_borrow(borrow,v,v,v);
        v&=static_cast<unsigned_type>(38);

        // Second correction round.
        borrow=sub_borrow(false,f[0],v,f[0]);
        borrow=sub_borrow(borrow,f[1],zero,f[1]);
        borrow=sub_borrow(borrow,f[2],zero,f[2]);
        // The top limb must be updated from f[3] itself; taking f[2] as the
        // minuend would overwrite limb 3 with a value derived from limb 2.
        // The outgoing borrow of this last step is not needed.
        sub_borrow(borrow,f[3],zero,f[3]);
        return f;
}

https://godbolt.org/z/xM8xef

operator-(field_number const&, field_number const&):
        movq    (%rsi), %r9
        subq    (%rdx), %r9
        movq    %rdi, %r8
        movq    %rdx, %rax
        movq    %r9, (%rdi)
        movq    8(%rsi), %rdi
        sbbq    8(%rdx), %rdi
        movq    %rdi, 8(%r8)
        movq    16(%rsi), %rdx
        sbbq    16(%rax), %rdx
        movq    %rdx, 16(%r8)
        movq    24(%rax), %rax
        movq    24(%rsi), %rsi
        sbbq    %rax, %rsi

// There is an output dependency here. The "movl $0, %eax" should not be needed.
        movl    $0, %eax
        movq    %rax, %rcx
        sbbq    %rax, %rcx
        andl    $38, %ecx
        subq    %rcx, %r9
        sbbq    %rax, %rdi // Why "sbbq %rax, %rdi" instead of "sbbq $0, %rdi"?
// GCC should not need to allocate or use the %rax register here at all.
        sbbq    %rax, %rdx
        sbbq    %rax, %rsi
        sbbq    %rcx, %rcx
        andl    $38, %ecx
        subq    %rcx, %r9
        sbbq    %rax, %rdi
        movq    %r9, (%r8)
        sbbq    %rax, %rdx
        movq    %rdi, 8(%r8)
        movq    %rdx, 16(%r8)
        sbbq    %rax, %rdx
        movq    %r8, %rax
        movq    %rdx, 24(%r8)
        ret


The assembly GCC generates is still worse than clang's, although clang does not
generate optimal code either.

The constant zero operand of the subborrow builtin is not turned into an immediate operand by GCC.

The "correct" assembly should look like what clang generates (using different
registers is no problem), minus clang's "xorl    %ecx, %ecx" clean-up
instruction.


operator-(field_number const&, field_number const&):                #
@operator-(field_number const&, field_number const&)
        movq    %rdi, %rax
        movq    (%rsi), %r8
        subq    (%rdx), %r8
        movq    8(%rsi), %r9
        sbbq    8(%rdx), %r9
        movq    16(%rsi), %rdi
        sbbq    16(%rdx), %rdi
        movq    24(%rsi), %rsi
        sbbq    24(%rdx), %rsi
        sbbq    %rcx, %rcx
        andl    $38, %ecx
        subq    %rcx, %r8
        sbbq    $0, %r9
        sbbq    $0, %rdi
        sbbq    $0, %rsi
        sbbq    %rcx, %rcx
        andl    $38, %ecx
        subq    %rcx, %r8
        sbbq    $0, %r9
        movq    %r8, (%rax)
        movq    %r9, 8(%rax)
        sbbq    $0, %rdi
        movq    %rdi, 16(%rax)
        sbbq    $0, %rdi
        movq    %rdi, 24(%rax)
        retq

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2020-10-15 17:00 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-10-15  8:28 [Bug rtl-optimization/97437] New: builtins subcarry and addcarry still not generate the right code. Not get optimized to immediate value euloanty at live dot com
2020-10-15 11:20 ` [Bug target/97437] " jakub at gcc dot gnu.org
2020-10-15 12:29 ` jakub at gcc dot gnu.org
2020-10-15 12:52 ` jakub at gcc dot gnu.org
2020-10-15 13:47 ` euloanty at live dot com
2020-10-15 13:57 ` segher at gcc dot gnu.org
2020-10-15 13:59 ` segher at gcc dot gnu.org
2020-10-15 14:00 ` jakub at gcc dot gnu.org
2020-10-15 15:14 ` segher at gcc dot gnu.org
2020-10-15 15:20 ` jakub at gcc dot gnu.org
2020-10-15 17:00 ` segher at gcc dot gnu.org

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).