* [Bug target/99548] Help me! Lost the fight against the compiler.
2021-03-11 11:00 [Bug rtl-optimization/99548] New: Help me! Lost the fight against the compiler unlvsur at live dot com
@ 2021-03-11 20:10 ` pinskia at gcc dot gnu.org
2021-03-11 23:35 ` unlvsur at live dot com
2021-12-23 21:45 ` pinskia at gcc dot gnu.org
2 siblings, 0 replies; 4+ messages in thread
From: pinskia at gcc dot gnu.org @ 2021-03-11 20:10 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99548
Andrew Pinski <pinskia at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Component|rtl-optimization |target
--- Comment #1 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
For aarch64 we get:
ldp x4, x3, [x1]
mov x7, 38
ldp x6, x5, [x2]
ldr x8, [x1, 16]
add x6, x4, x6
cmp x4, x6
adc x5, x3, x5
cmp x3, x5
ldr x3, [x1, 24]
ldp x4, x1, [x2, 16]
adc x4, x8, x4
cmp x8, x4
adc x1, x3, x1
cmp x3, x1
csetm x2, cs
and x2, x2, x7
add x2, x2, x6
cmp x2, x6
cinc x6, x5, ls
cmp x6, x5
cinc x5, x4, ls
cmp x5, x4
cinc x3, x1, ls
cmp x3, x1
csetm x1, ls
and x1, x1, x7
add x1, x1, x2
str x1, [x0]
cmp x1, x2
cinc x1, x6, ls
str x1, [x0, 8]
cmp x1, x6
cinc x1, x5, ls
cmp x1, x5
cinc x3, x3, ls
stp x1, x3, [x0, 16]
ret
There is only one missing optimization there really:
csetm x1, ls
and x1, x1, x7
Should be turned into:
csel x1, x7, xzr, ls
Which is repeated a few times.
It looks like x86_64 has other issues.
^ permalink raw reply [flat|nested] 4+ messages in thread
* [Bug target/99548] Help me! Lost the fight against the compiler.
2021-03-11 11:00 [Bug rtl-optimization/99548] New: Help me! Lost the fight against the compiler unlvsur at live dot com
2021-03-11 20:10 ` [Bug target/99548] " pinskia at gcc dot gnu.org
@ 2021-03-11 23:35 ` unlvsur at live dot com
2021-12-23 21:45 ` pinskia at gcc dot gnu.org
2 siblings, 0 replies; 4+ messages in thread
From: unlvsur at live dot com @ 2021-03-11 23:35 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99548
--- Comment #2 from cqwrteur <unlvsur at live dot com> ---
(In reply to Andrew Pinski from comment #1)
> For aarch64 we get:
> ldp x4, x3, [x1]
> mov x7, 38
> ldp x6, x5, [x2]
> ldr x8, [x1, 16]
> add x6, x4, x6
> cmp x4, x6
> adc x5, x3, x5
> cmp x3, x5
> ldr x3, [x1, 24]
> ldp x4, x1, [x2, 16]
> adc x4, x8, x4
> cmp x8, x4
> adc x1, x3, x1
> cmp x3, x1
> csetm x2, cs
> and x2, x2, x7
> add x2, x2, x6
> cmp x2, x6
> cinc x6, x5, ls
> cmp x6, x5
> cinc x5, x4, ls
> cmp x5, x4
> cinc x3, x1, ls
> cmp x3, x1
> csetm x1, ls
> and x1, x1, x7
> add x1, x1, x2
> str x1, [x0]
> cmp x1, x2
> cinc x1, x6, ls
> str x1, [x0, 8]
> cmp x1, x6
> cinc x1, x5, ls
> cmp x1, x5
> cinc x3, x3, ls
> stp x1, x3, [x0, 16]
> ret
> There is only one missing optimization there really:
> csetm x1, ls
> and x1, x1, x7
> Should be turned into:
> csel x1, x7, xzr, ls
> Which is repeated a few times.
>
> It looks like x86_64 has other issues.
my add-carry implementation for non-x86_64 platforms is wrong on godbolt.
^ permalink raw reply [flat|nested] 4+ messages in thread
* [Bug target/99548] Help me! Lost the fight against the compiler.
2021-03-11 11:00 [Bug rtl-optimization/99548] New: Help me! Lost the fight against the compiler unlvsur at live dot com
2021-03-11 20:10 ` [Bug target/99548] " pinskia at gcc dot gnu.org
2021-03-11 23:35 ` unlvsur at live dot com
@ 2021-12-23 21:45 ` pinskia at gcc dot gnu.org
2 siblings, 0 replies; 4+ messages in thread
From: pinskia at gcc dot gnu.org @ 2021-12-23 21:45 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99548
--- Comment #3 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
#include<array>
#include<cstddef>
#include<cstdint>
#include<type_traits>
#if defined(_MSC_VER)
#include<intrin.h>
#elif defined(__x86_64__) || defined(__i386__)
#include<immintrin.h>
#endif
// A 256-bit field element: four 64-bit limbs on 64-bit targets, eight
// 32-bit limbs otherwise (limb count chosen from the native word size).
// NOTE: the comparison must be parenthesized -- an unparenthesized
// '>=' inside a template-argument list risks being mis-lexed as the
// closing '>' of the list.
using field_number =
std::conditional_t<(sizeof(std::size_t)>=8),std::array<std::uint64_t,4>,std::array<std::uint32_t,8>>;
namespace intrinsics
{
// Computes out = a - b - borrow (wrapping) and returns the borrow-out,
// i.e. whether b + borrow exceeded a.  Mirrors x86 SBB / _subborrow_*.
//
// The borrow-out of a - b - borrow is exactly (a < b) || (borrow && a == b).
// The previous portable fallback `(out=a-b-borrow)>=a` falsely reported a
// borrow whenever b == 0 && !borrow (out == a in that case) -- this is the
// "wrong on non-x86_64" bug noted in PR99548 comment #2.
template<typename T>
#if __cpp_lib_concepts >= 202002L
requires (std::unsigned_integral<T>)
#endif
inline constexpr bool sub_borrow(bool borrow,T a,T b,T& out) noexcept
{
#if defined(_MSC_VER) || defined(__x86_64__) || defined(__i386__)
#if __cpp_lib_is_constant_evaluated >= 201811L
if(std::is_constant_evaluated())
{
// constexpr-safe portable path
out=static_cast<T>(a-b-borrow);
return a<b||(borrow&&a==b);
}
else
#endif
{
if constexpr(sizeof(T)==8)
#if defined(__x86_64__)
return _subborrow_u64(borrow,a,b,
#if !defined(__INTEL_COMPILER ) &&(defined(__GNUC__) || defined(__clang__))
// GCC/Clang declare the out parameter as unsigned long long*
reinterpret_cast<unsigned long long*>(&out));
#else
&out);
#endif
#else
{
// 64-bit subtract without a 64-bit intrinsic (e.g. 32-bit MSVC)
out=static_cast<T>(a-b-borrow);
return a<b||(borrow&&a==b);
}
#endif
else if constexpr(sizeof(T)==4)
return
_subborrow_u32(borrow,a,b,reinterpret_cast<std::uint32_t*>(&out));
else if constexpr(sizeof(T)==2)
return
_subborrow_u16(borrow,a,b,reinterpret_cast<std::uint16_t*>(&out));
else if constexpr(sizeof(T)==1)
return
_subborrow_u8(borrow,a,b,reinterpret_cast<std::uint8_t*>(&out));
else
{
// defensive fallback for unusual unsigned widths
out=static_cast<T>(a-b-borrow);
return a<b||(borrow&&a==b);
}
}
#else
// Generic targets (e.g. aarch64): no borrow intrinsic available.
out=static_cast<T>(a-b-borrow);
return a<b||(borrow&&a==b);
#endif
}
}
// Computes out = a + b + carry (wrapping) and returns the carry-out.
// Mirrors x86 ADC / _addcarry_*.
//
// The carry-out of a + b + carry is exactly (out < a) || (carry && out == a).
// The previous portable fallback `(out=a+b+carry)<=a` falsely reported a
// carry whenever b == 0 && !carry (out == a in that case) -- the bug the
// submitter acknowledges in PR99548 comment #2.
template<typename T>
#if __cpp_lib_concepts >= 202002L
requires (std::unsigned_integral<T>)
#endif
inline constexpr bool add_carry(bool carry,T a,T b,T& out) noexcept
{
#if defined(_MSC_VER) || defined(__x86_64__) || defined(__i386__)
#if __cpp_lib_is_constant_evaluated >= 201811L
if(std::is_constant_evaluated())
{
// constexpr-safe portable path
out=static_cast<T>(a+b+carry);
return out<a||(carry&&out==a);
}
else
#endif
{
if constexpr(sizeof(T)==8)
#if defined(__x86_64__)
return _addcarry_u64(carry,a,b,
#if !defined(__INTEL_COMPILER ) &&(defined(__GNUC__) || defined(__clang__))
// GCC/Clang declare the out parameter as unsigned long long*
reinterpret_cast<unsigned long long*>(&out));
#else
&out);
#endif
#else
{
// 64-bit add without a 64-bit intrinsic (e.g. 32-bit MSVC)
out=static_cast<T>(a+b+carry);
return out<a||(carry&&out==a);
}
#endif
else if constexpr(sizeof(T)==4)
return
_addcarry_u32(carry,a,b,reinterpret_cast<std::uint32_t*>(&out));
else if constexpr(sizeof(T)==2)
return
_addcarry_u16(carry,a,b,reinterpret_cast<std::uint16_t*>(&out));
else if constexpr(sizeof(T)==1)
return
_addcarry_u8(carry,a,b,reinterpret_cast<std::uint8_t*>(&out));
else
{
// defensive fallback for unusual unsigned widths
out=static_cast<T>(a+b+carry);
return out<a||(carry&&out==a);
}
}
#else
// Generic targets (e.g. aarch64): no carry intrinsic available.
out=static_cast<T>(a+b+carry);
return out<a||(carry&&out==a);
#endif
}
// Hand-written x86-64 (AT&T syntax, GCC/Clang) reference: r = x + y over
// four little-endian 64-bit limbs.  The carry out of the 256-bit add is
// materialized with sbb (rv becomes all-ones iff carry was set), masked
// to 38, and folded back into the low limb; the fold is done twice, the
// second pass interleaved with the stores.  38 == 2*19, so this is
// presumably a 2^255-19 (curve25519-style) field addition -- confirm
// against the caller's field definition.
// NOTE(review): will not assemble on MSVC or non-x86-64 targets.
void my_asm_field_add(
std::uint64_t* __restrict r,                // out: four-limb result
std::uint64_t const* __restrict x,          // in: four-limb addend
std::uint64_t const* __restrict y) noexcept // in: four-limb addend
{
std::uint64_t r0,r1,r2,r3; // scratch limbs
std::uint64_t rv;          // carry mask: 0 or ~0, then 0 or 38
__asm__ __volatile__(R"(mov (%[x]),%[r0]
add (%[y]),%[r0]
mov 8(%[x]),%[r1]
adc 8(%[y]),%[r1]
mov 16(%[x]),%[r2]
adc 16(%[y]),%[r2]
mov 24(%[x]),%[r3]
adc 24(%[y]),%[r3]
sbb %[rv],%[rv]
and $38,%[rv]
add %[rv],%[r0]
adc $0,%[r1]
adc $0,%[r2]
adc $0,%[r3]
sbb %[rv],%[rv]
and $38,%[rv]
add %[rv],%[r0]
mov %[r0],(%[res])
adc $0,%[r1]
mov %[r1],8(%[res])
adc $0,%[r2]
mov %[r2],16(%[res])
adc $0,%[r3]
mov %[r3],24(%[res]))":
// Early-clobber ("=&r") outputs: all are written before the last reads
// of the address inputs [x]/[y]/[res].
[r0]"=&r"(r0),[r1]"=&r"(r1),[r2]"=&r"(r2),[r3]"=&r"(r3),[rv]"=&r"(rv):
// [res] is only an address input; the stores through it are covered by
// the "memory" clobber.  "cc" is clobbered by add/adc/sbb/and.
[x]"r"(x),[y]"r"(y),[res]"r"(r):"memory","cc");
}
void intrinsics_add(std::uint64_t* __restrict f,
std::uint64_t const* __restrict x,
std::uint64_t const* __restrict y) noexcept
{
using namespace intrinsics;
using unsigned_type = field_number::value_type;
constexpr unsigned_type zero{};
std::uint64_t f0,f1,f2,f3;
bool carry{add_carry(false,x[0],y[0],f0)};
carry=add_carry(carry,x[1],y[1],f1);
carry=add_carry(carry,x[2],y[2],f2);
carry=add_carry(carry,x[3],y[3],f3);
unsigned_type v=0;
sub_borrow(carry,v,v,v);
v&=static_cast<unsigned_type>(38);
carry=add_carry(false,f0,v,f0);
carry=add_carry(carry,f1,zero,f1);
carry=add_carry(carry,f2,zero,f2);
carry=add_carry(carry,f3,zero,f3);
sub_borrow(carry,v,v,v);
v&=static_cast<unsigned_type>(38);
carry=add_carry(false,f0,v,f[0]);
carry=add_carry(carry,f1,zero,f[1]);
carry=add_carry(carry,f2,zero,f[2]);
carry=add_carry(carry,f3,zero,f[3]);
}
^ permalink raw reply [flat|nested] 4+ messages in thread