public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug tree-optimization/102974] New: GCC optimization is very poor for add carry and multiplication combos
@ 2021-10-27 21:17 unlvsur at live dot com
2021-10-27 21:18 ` [Bug tree-optimization/102974] " pinskia at gcc dot gnu.org
` (15 more replies)
0 siblings, 16 replies; 17+ messages in thread
From: unlvsur at live dot com @ 2021-10-27 21:17 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102974
Bug ID: 102974
Summary: GCC optimization is very poor for add carry and
multiplication combos
Product: gcc
Version: 12.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: tree-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: unlvsur at live dot com
Target Milestone: ---
GCC:
https://godbolt.org/z/6sc6v5YcG
clang:
https://godbolt.org/z/eP8fTrWzd
msvc:
https://godbolt.org/z/snzoEe5he
GCC generates 16 more instructions than msvc and clang for carry flag
optimizations.
^ permalink raw reply [flat|nested] 17+ messages in thread
* [Bug tree-optimization/102974] GCC optimization is very poor for add carry and multiplication combos
2021-10-27 21:17 [Bug tree-optimization/102974] New: GCC optimization is very poor for add carry and multiplication combos unlvsur at live dot com
@ 2021-10-27 21:18 ` pinskia at gcc dot gnu.org
2021-10-27 21:19 ` [Bug target/102974] " pinskia at gcc dot gnu.org
` (14 subsequent siblings)
15 siblings, 0 replies; 17+ messages in thread
From: pinskia at gcc dot gnu.org @ 2021-10-27 21:18 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102974
--- Comment #1 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
#include<cstdint>
#include<bit>
struct ul32x2
{
std::uint_least32_t low,high;
};
inline constexpr std::uint_least32_t umul_least_32(std::uint_least32_t
a,std::uint_least32_t b,std::uint_least32_t& high) noexcept
{
if
constexpr(std::endian::native==std::endian::little||std::endian::native==std::endian::big)
{
struct ul32x2_little_endian_t
{
std::uint_least32_t low,high;
};
struct ul32x2_big_endian_t
{
std::uint_least32_t high,low;
};
using ul32x2_t =
std::conditional_t<std::endian::native==std::endian::little,ul32x2_little_endian_t,ul32x2_big_endian_t>;
auto
ret{__builtin_bit_cast(ul32x2_t,static_cast<std::uint_least64_t>(a)*b)};
high=ret.high;
return ret.low;
}
else
{
std::uint_least64_t v{static_cast<std::uint_least64_t>(a)*b};
high=static_cast<std::uint_least32_t>(v>>32u);
return static_cast<std::uint_least32_t>(v);
}
}
template<typename T>
#if __cpp_lib_concepts >= 202002L
requires (std::unsigned_integral<T>)
#endif
inline constexpr bool add_carry_naive(bool carry,T a,T b,T& out) noexcept
{
T temp{carry+a};
out=temp+b;
return (out < b) | (temp < a);
}
template<typename T>
#if __cpp_lib_concepts >= 202002L
requires (std::unsigned_integral<T>)
#endif
inline constexpr bool add_carry(bool carry,T a,T b,T& out) noexcept
{
#if __cpp_lib_is_constant_evaluated >= 201811L
if(std::is_constant_evaluated())
return add_carry_naive(carry,a,b,out);
else
#endif
{
#if defined(_MSC_VER) && !defined(__clang__)
#if (defined(_M_IX86) || defined(_M_AMD64))
if constexpr(sizeof(T)==8)
{
#if defined(_M_AMD64)
return
_addcarryx_u64(carry,a,b,reinterpret_cast<std::uint64_t*>(__builtin_addressof(out)));
#else
return _addcarryx_u32(_addcarryx_u32(carry,
*reinterpret_cast<std::uint32_t*>(__builtin_addressof(a)),*reinterpret_cast<std::uint32_t*>(__builtin_addressof(b)),reinterpret_cast<std::uint32_t*>(__builtin_addressof(out))),
reinterpret_cast<std::uint32_t*>(__builtin_addressof(a))[1],reinterpret_cast<std::uint32_t*>(__builtin_addressof(b))[1],reinterpret_cast<std::uint32_t*>(__builtin_addressof(out))+1);
#endif
}
else if constexpr(sizeof(T)==4)
return
_addcarryx_u32(carry,a,b,reinterpret_cast<std::uint32_t*>(__builtin_addressof(out)));
else if constexpr(sizeof(T)==2)
return _addcarry_u16(carry,a,b,reinterpret_cast<short
unsigned*>(__builtin_addressof(out)));
else if constexpr(sizeof(T)==1)
return _addcarry_u8(carry,a,b,reinterpret_cast<char
unsigned*>(__builtin_addressof(out)));
else
return add_carry_naive(carry,a,b,out);
#else
return add_carry_naive(carry,a,b,out);
#endif
#elif defined(__has_builtin) &&
(__has_builtin(__builtin_addcb)&&__has_builtin(__builtin_addcs)&&__has_builtin(__builtin_addc)&&__has_builtin(__builtin_addcl)&&__has_builtin(__builtin_addcll))
if constexpr(sizeof(T)==sizeof(long long unsigned))
{
long long unsigned carryout;
out=__builtin_addcll(a,b,carry,__builtin_addressof(carryout));
return carryout;
}
else if constexpr(sizeof(T)==sizeof(long unsigned))
{
long unsigned carryout;
out=__builtin_addcl(a,b,carry,__builtin_addressof(carryout));
return carryout;
}
else if constexpr(sizeof(T)==sizeof(unsigned))
{
unsigned carryout;
out=__builtin_addc(a,b,carry,__builtin_addressof(carryout));
return carryout;
}
else if constexpr(sizeof(T)==sizeof(short unsigned))
{
short unsigned carryout;
out=__builtin_addcs(a,b,carry,__builtin_addressof(carryout));
return carryout;
}
else if constexpr(sizeof(T)==sizeof(char unsigned))
{
char unsigned carryout;
out=__builtin_addcb(a,b,carry,__builtin_addressof(carryout));
return carryout;
}
else
{
return add_carry_naive(carry,a,b,out);
}
#elif defined(__has_builtin) &&
(__has_builtin(__builtin_ia32_addcarryx_u32)||__has_builtin(__builtin_ia32_addcarry_u32)||__has_builtin(__builtin_ia32_addcarryx_u64))
if constexpr(sizeof(T)==8)
{
#if __has_builtin(__builtin_ia32_addcarryx_u64)
using may_alias_ptr_type [[gnu::may_alias]] = unsigned long
long*;
return
__builtin_ia32_addcarryx_u64(carry,a,b,reinterpret_cast<may_alias_ptr_type>(__builtin_addressof(out)));
#else
std::uint32_t a_low;
std::uint32_t a_high;
__builtin_memcpy(__builtin_addressof(a_low),__builtin_addressof(a),4);
__builtin_memcpy(__builtin_addressof(a_high),reinterpret_cast<char
const*>(__builtin_addressof(a))+4,4);
std::uint32_t b_low;
std::uint32_t b_high;
__builtin_memcpy(__builtin_addressof(b_low),__builtin_addressof(b),4);
__builtin_memcpy(__builtin_addressof(b_high),reinterpret_cast<char
const*>(__builtin_addressof(b))+4,4);
using may_alias_ptr_type [[gnu::may_alias]] = unsigned*;
#if __has_builtin(__builtin_ia32_addcarry_u32)
return
__builtin_ia32_addcarry_u32(__builtin_ia32_addcarry_u32(carry,a_low,b_low,reinterpret_cast<may_alias_ptr_type>(__builtin_addressof(out))),
a_high,b_high,reinterpret_cast<may_alias_ptr_type>(__builtin_addressof(out))+1);
#elif __has_builtin(__builtin_ia32_addcarryx_u32)
return
__builtin_ia32_addcarryx_u32(__builtin_ia32_addcarryx_u32(carry,a_low,b_low,reinterpret_cast<may_alias_ptr_type>(__builtin_addressof(out))),
a_high,b_high,reinterpret_cast<may_alias_ptr_type>(__builtin_addressof(out))+1);
#else
return add_carry_naive(carry,a,b,out);
#endif
#endif
}
else if constexpr(sizeof(T)==4)
{
using may_alias_ptr_type [[gnu::may_alias]] = unsigned*;
#if __has_builtin(__builtin_ia32_addcarry_u32)
return
__builtin_ia32_addcarry_u32(carry,a,b,reinterpret_cast<may_alias_ptr_type>(__builtin_addressof(out)));
#elif __has_builtin(__builtin_ia32_addcarryx_u32)
return
__builtin_ia32_addcarryx_u32(carry,a,b,reinterpret_cast<may_alias_ptr_type>(__builtin_addressof(out)));
#else
return add_carry_naive(carry,a,b,out);
#endif
}
else
return add_carry_naive(carry,a,b,out); //16 bit addcarry
simply does not exist on gcc and clang
#else
return add_carry_naive(carry,a,b,out);
#endif
}
}
std::uint_least64_t umul_least_64(std::uint_least64_t a,std::uint_least64_t
b,std::uint_least64_t& high) noexcept
{
auto [a0,a1]=__builtin_bit_cast(ul32x2,a);
auto [b0,b1]=__builtin_bit_cast(ul32x2,b);
std::uint_least32_t c1;
std::uint_least32_t c0{umul_least_32(a0,b0,c1)};
std::uint_least32_t a0b1h;
std::uint_least32_t a0b1l{umul_least_32(a0,b1,a0b1h)};
std::uint_least32_t a1b0h;
std::uint_least32_t a1b0l{umul_least_32(a1,b0,a1b0h)};
std::uint_least32_t c3;
std::uint_least32_t c2{umul_least_32(a1,b1,c3)};
bool carry{add_carry(false,c1,a0b1l,c1)};
carry=add_carry(carry,a0b1h,c2,c2);
std::uint_least32_t temp{carry};
carry=add_carry(false,c1,a1b0l,c1);
carry=add_carry(carry,a1b0h,c2,c2);
add_carry(carry,temp,c3,c3);
high=__builtin_bit_cast(std::uint_least64_t,ul32x2{c2,c3});
return __builtin_bit_cast(std::uint_least64_t,ul32x2{c0,c1});
}
^ permalink raw reply [flat|nested] 17+ messages in thread
* [Bug target/102974] GCC optimization is very poor for add carry and multiplication combos
2021-10-27 21:17 [Bug tree-optimization/102974] New: GCC optimization is very poor for add carry and multiplication combos unlvsur at live dot com
2021-10-27 21:18 ` [Bug tree-optimization/102974] " pinskia at gcc dot gnu.org
@ 2021-10-27 21:19 ` pinskia at gcc dot gnu.org
2021-10-27 21:22 ` unlvsur at live dot com
` (13 subsequent siblings)
15 siblings, 0 replies; 17+ messages in thread
From: pinskia at gcc dot gnu.org @ 2021-10-27 21:19 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102974
Andrew Pinski <pinskia at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Component|tree-optimization |target
--- Comment #2 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
There might be another bug about _addcarryx_u64 already.
^ permalink raw reply [flat|nested] 17+ messages in thread
* [Bug target/102974] GCC optimization is very poor for add carry and multiplication combos
2021-10-27 21:17 [Bug tree-optimization/102974] New: GCC optimization is very poor for add carry and multiplication combos unlvsur at live dot com
2021-10-27 21:18 ` [Bug tree-optimization/102974] " pinskia at gcc dot gnu.org
2021-10-27 21:19 ` [Bug target/102974] " pinskia at gcc dot gnu.org
@ 2021-10-27 21:22 ` unlvsur at live dot com
2021-10-27 21:23 ` unlvsur at live dot com
` (12 subsequent siblings)
15 siblings, 0 replies; 17+ messages in thread
From: unlvsur at live dot com @ 2021-10-27 21:22 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102974
--- Comment #3 from cqwrteur <unlvsur at live dot com> ---
(In reply to Andrew Pinski from comment #2)
> There might be another bug about _addcarryx_u64 already.
This is 32 bit addcarry.
^ permalink raw reply [flat|nested] 17+ messages in thread
* [Bug target/102974] GCC optimization is very poor for add carry and multiplication combos
2021-10-27 21:17 [Bug tree-optimization/102974] New: GCC optimization is very poor for add carry and multiplication combos unlvsur at live dot com
` (2 preceding siblings ...)
2021-10-27 21:22 ` unlvsur at live dot com
@ 2021-10-27 21:23 ` unlvsur at live dot com
2021-10-27 21:24 ` pinskia at gcc dot gnu.org
` (11 subsequent siblings)
15 siblings, 0 replies; 17+ messages in thread
From: unlvsur at live dot com @ 2021-10-27 21:23 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102974
--- Comment #4 from cqwrteur <unlvsur at live dot com> ---
(In reply to cqwrteur from comment #3)
> (In reply to Andrew Pinski from comment #2)
> > There might be another bug about _addcarryx_u64 already.
>
> This is 32 bit addcarry.
but yeah. GCC does not perform optimizations very well to add carries and mul +
recognize >>64u <<64u patterns
^ permalink raw reply [flat|nested] 17+ messages in thread
* [Bug target/102974] GCC optimization is very poor for add carry and multiplication combos
2021-10-27 21:17 [Bug tree-optimization/102974] New: GCC optimization is very poor for add carry and multiplication combos unlvsur at live dot com
` (3 preceding siblings ...)
2021-10-27 21:23 ` unlvsur at live dot com
@ 2021-10-27 21:24 ` pinskia at gcc dot gnu.org
2021-10-27 22:39 ` unlvsur at live dot com
` (10 subsequent siblings)
15 siblings, 0 replies; 17+ messages in thread
From: pinskia at gcc dot gnu.org @ 2021-10-27 21:24 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102974
--- Comment #5 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
(In reply to cqwrteur from comment #4)
> (In reply to cqwrteur from comment #3)
> > (In reply to Andrew Pinski from comment #2)
> > > There might be another bug about _addcarryx_u64 already.
> >
> > This is 32 bit addcarry.
>
> but yeah. GCC does not perform optimizations very well to add carries and
> mul + recognize >>64u <<64u patterns
I mean all of _addcarryx_* intrinsics.
^ permalink raw reply [flat|nested] 17+ messages in thread
* [Bug target/102974] GCC optimization is very poor for add carry and multiplication combos
2021-10-27 21:17 [Bug tree-optimization/102974] New: GCC optimization is very poor for add carry and multiplication combos unlvsur at live dot com
` (4 preceding siblings ...)
2021-10-27 21:24 ` pinskia at gcc dot gnu.org
@ 2021-10-27 22:39 ` unlvsur at live dot com
2021-10-27 23:02 ` unlvsur at live dot com
` (9 subsequent siblings)
15 siblings, 0 replies; 17+ messages in thread
From: unlvsur at live dot com @ 2021-10-27 22:39 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102974
--- Comment #6 from cqwrteur <unlvsur at live dot com> ---
(In reply to Andrew Pinski from comment #5)
> (In reply to cqwrteur from comment #4)
> > (In reply to cqwrteur from comment #3)
> > > (In reply to Andrew Pinski from comment #2)
> > > > There might be another bug about _addcarryx_u64 already.
> > >
> > > This is 32 bit addcarry.
> >
> > but yeah. GCC does not perform optimizations very well to add carries and
> > mul + recognize >>64u <<64u patterns
>
> I mean all of _addcarryx_* intrinsics.
https://godbolt.org/z/qq3nb49Eq
https://godbolt.org/z/cqoYG35jx
Also this is weird. just extract part of code into function generates different
assembly for __builtin_bit_cast. It must be a inliner bug.
^ permalink raw reply [flat|nested] 17+ messages in thread
* [Bug target/102974] GCC optimization is very poor for add carry and multiplication combos
2021-10-27 21:17 [Bug tree-optimization/102974] New: GCC optimization is very poor for add carry and multiplication combos unlvsur at live dot com
` (5 preceding siblings ...)
2021-10-27 22:39 ` unlvsur at live dot com
@ 2021-10-27 23:02 ` unlvsur at live dot com
2021-10-28 8:20 ` rguenth at gcc dot gnu.org
` (8 subsequent siblings)
15 siblings, 0 replies; 17+ messages in thread
From: unlvsur at live dot com @ 2021-10-27 23:02 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102974
--- Comment #7 from cqwrteur <unlvsur at live dot com> ---
(In reply to cqwrteur from comment #6)
> (In reply to Andrew Pinski from comment #5)
> > (In reply to cqwrteur from comment #4)
> > > (In reply to cqwrteur from comment #3)
> > > > (In reply to Andrew Pinski from comment #2)
> > > > > There might be another bug about _addcarryx_u64 already.
> > > >
> > > > This is 32 bit addcarry.
> > >
> > > but yeah. GCC does not perform optimizations very well to add carries and
> > > mul + recognize >>64u <<64u patterns
> >
> > I mean all of _addcarryx_* intrinsics.
>
> https://godbolt.org/z/qq3nb49Eq
> https://godbolt.org/z/cqoYG35jx
> Also this is weird. just extract part of code into function generates
> different assembly for __builtin_bit_cast. It must be a inliner bug.
my fault for misreading(In reply to Andrew Pinski from comment #5)
> (In reply to cqwrteur from comment #4)
> > (In reply to cqwrteur from comment #3)
> > > (In reply to Andrew Pinski from comment #2)
> > > > There might be another bug about _addcarryx_u64 already.
> > >
> > > This is 32 bit addcarry.
> >
> > but yeah. GCC does not perform optimizations very well to add carries and
> > mul + recognize >>64u <<64u patterns
>
> I mean all of _addcarryx_* intrinsics.
This example is also interesting that -O2, -O3, -Ofast generates much worse
assembly than -O1. There is no point for doing SIMD for things like this
^ permalink raw reply [flat|nested] 17+ messages in thread
* [Bug target/102974] GCC optimization is very poor for add carry and multiplication combos
2021-10-27 21:17 [Bug tree-optimization/102974] New: GCC optimization is very poor for add carry and multiplication combos unlvsur at live dot com
` (6 preceding siblings ...)
2021-10-27 23:02 ` unlvsur at live dot com
@ 2021-10-28 8:20 ` rguenth at gcc dot gnu.org
2021-10-28 9:56 ` unlvsur at live dot com
` (7 subsequent siblings)
15 siblings, 0 replies; 17+ messages in thread
From: rguenth at gcc dot gnu.org @ 2021-10-28 8:20 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102974
--- Comment #8 from Richard Biener <rguenth at gcc dot gnu.org> ---
As mentioned in the other bug STV might make things only worse.
^ permalink raw reply [flat|nested] 17+ messages in thread
* [Bug target/102974] GCC optimization is very poor for add carry and multiplication combos
2021-10-27 21:17 [Bug tree-optimization/102974] New: GCC optimization is very poor for add carry and multiplication combos unlvsur at live dot com
` (7 preceding siblings ...)
2021-10-28 8:20 ` rguenth at gcc dot gnu.org
@ 2021-10-28 9:56 ` unlvsur at live dot com
2021-11-01 10:07 ` marxin at gcc dot gnu.org
` (6 subsequent siblings)
15 siblings, 0 replies; 17+ messages in thread
From: unlvsur at live dot com @ 2021-10-28 9:56 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102974
--- Comment #9 from cqwrteur <unlvsur at live dot com> ---
(In reply to Richard Biener from comment #8)
> As mentioned in the other bug STV might make things only worse.
what is stv?
^ permalink raw reply [flat|nested] 17+ messages in thread
* [Bug target/102974] GCC optimization is very poor for add carry and multiplication combos
2021-10-27 21:17 [Bug tree-optimization/102974] New: GCC optimization is very poor for add carry and multiplication combos unlvsur at live dot com
` (8 preceding siblings ...)
2021-10-28 9:56 ` unlvsur at live dot com
@ 2021-11-01 10:07 ` marxin at gcc dot gnu.org
2023-06-03 23:53 ` slash.tmp at free dot fr
` (5 subsequent siblings)
15 siblings, 0 replies; 17+ messages in thread
From: marxin at gcc dot gnu.org @ 2021-11-01 10:07 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102974
Martin Liška <marxin at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
CC| |marxin at gcc dot gnu.org
--- Comment #10 from Martin Liška <marxin at gcc dot gnu.org> ---
(In reply to cqwrteur from comment #9)
> (In reply to Richard Biener from comment #8)
> > As mentioned in the other bug STV might make things only worse.
>
> what is stv?
$ gcc --help=target | grep mstv
-mstv Disable Scalar to Vector optimization pass
transforming 64-bit integer computations into a vector ones.
^ permalink raw reply [flat|nested] 17+ messages in thread
* [Bug target/102974] GCC optimization is very poor for add carry and multiplication combos
2021-10-27 21:17 [Bug tree-optimization/102974] New: GCC optimization is very poor for add carry and multiplication combos unlvsur at live dot com
` (9 preceding siblings ...)
2021-11-01 10:07 ` marxin at gcc dot gnu.org
@ 2023-06-03 23:53 ` slash.tmp at free dot fr
2023-06-06 6:56 ` slash.tmp at free dot fr
` (4 subsequent siblings)
15 siblings, 0 replies; 17+ messages in thread
From: slash.tmp at free dot fr @ 2023-06-03 23:53 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102974
--- Comment #11 from Mason <slash.tmp at free dot fr> ---
Here's umul_least_64() rewritten as mul_64x64_128() in C
typedef unsigned int u32;
typedef unsigned long long u64;
/* u32 acc[3], a[1], b[1] */
static void mul_add_32x32(u32 *acc, const u32 *a, const u32 *b)
{
u64 res = (u64)a[0] * b[0];
u32 lo = res, hi = res >> 32;
asm("add %[LO], %[D0]\n\t" "adc %[HI], %[D1]\n\t" "adc $0, %[D2]" :
[D0] "+m" (acc[0]), [D1] "+m" (acc[1]), [D2] "+m" (acc[2]) :
[LO] "r" (lo), [HI] "r" (hi) : "cc");
}
/* u32 acc[5], a[2], b[2] */
void mul_64x64_128(u32 *acc, const u32 *a, const u32 *b)
{
mul_add_32x32(acc+0, a+0, b+0);
mul_add_32x32(acc+1, a+0, b+1);
mul_add_32x32(acc+1, a+1, b+0);
mul_add_32x32(acc+2, a+1, b+1);
}
gcc-trunk -O3 -m32
mul_64x64_128:
pushl %esi
pushl %ebx
movl 16(%esp), %ebx ; ebx = a
movl 20(%esp), %esi ; esi = b
movl 12(%esp), %ecx ; ecx = acc
movl (%esi), %eax ; b0
mull (%ebx) ; a0*b0
add %eax, (%ecx)
adc %edx, 4(%ecx)
adc $0, 8(%ecx)
movl 4(%esi), %eax ; b1
mull (%ebx) ; a0*b1
add %eax, 4(%ecx)
adc %edx, 8(%ecx)
adc $0, 12(%ecx)
movl (%esi), %eax ; b0
mull 4(%ebx) ; a1*b0
add %eax, 4(%ecx)
adc %edx, 8(%ecx)
adc $0, 12(%ecx)
movl 4(%esi), %eax ; b1
mull 4(%ebx) ; a1*b1
add %eax, 8(%ecx)
adc %edx, 12(%ecx)
adc $0, 16(%ecx)
popl %ebx
popl %esi
ret
^ permalink raw reply [flat|nested] 17+ messages in thread
* [Bug target/102974] GCC optimization is very poor for add carry and multiplication combos
2021-10-27 21:17 [Bug tree-optimization/102974] New: GCC optimization is very poor for add carry and multiplication combos unlvsur at live dot com
` (10 preceding siblings ...)
2023-06-03 23:53 ` slash.tmp at free dot fr
@ 2023-06-06 6:56 ` slash.tmp at free dot fr
2023-06-06 7:01 ` unlvsur at live dot com
` (3 subsequent siblings)
15 siblings, 0 replies; 17+ messages in thread
From: slash.tmp at free dot fr @ 2023-06-06 6:56 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102974
--- Comment #12 from Mason <slash.tmp at free dot fr> ---
Actually, in this case, we don't need to propagate the carry over 3 limbs.
typedef unsigned int u32;
typedef unsigned long long u64;
/* u32 acc[2], a[1], b[1] */
static void mul_add_32x32(u32 *acc, const u32 *a, const u32 *b)
{
u64 res = (u64)a[0] * b[0];
u32 lo = res, hi = res >> 32;
asm("add %[LO], %[D0]\n\t" "adc %[HI], %[D1]" :
[D0] "+m" (acc[0]), [D1] "+m" (acc[1]) :
[LO] "r" (lo), [HI] "r" (hi) : "cc");
}
/* u32 acc[4], a[2], b[2] */
void mul_64x64_128(u32 *acc, const u32 *a, const u32 *b)
{
mul_add_32x32(acc+0, a+0, b+0);
mul_add_32x32(acc+1, a+0, b+1);
mul_add_32x32(acc+1, a+1, b+0);
mul_add_32x32(acc+2, a+1, b+1);
}
gcc-trunk -O3 -m32
mul_64x64_128:
pushl %esi
pushl %ebx
movl 16(%esp), %ebx
movl 20(%esp), %esi
movl 12(%esp), %ecx
movl (%esi), %eax
mull (%ebx)
add %eax, (%ecx)
adc %edx, 4(%ecx)
movl 4(%esi), %eax
mull (%ebx)
add %eax, 4(%ecx)
adc %edx, 8(%ecx)
movl (%esi), %eax
mull 4(%ebx)
add %eax, 4(%ecx)
adc %edx, 8(%ecx)
movl 4(%esi), %eax
mull 4(%ebx)
add %eax, 8(%ecx)
adc %edx, 12(%ecx)
popl %ebx
popl %esi
ret
^ permalink raw reply [flat|nested] 17+ messages in thread
* [Bug target/102974] GCC optimization is very poor for add carry and multiplication combos
2021-10-27 21:17 [Bug tree-optimization/102974] New: GCC optimization is very poor for add carry and multiplication combos unlvsur at live dot com
` (11 preceding siblings ...)
2023-06-06 6:56 ` slash.tmp at free dot fr
@ 2023-06-06 7:01 ` unlvsur at live dot com
2023-06-06 7:02 ` unlvsur at live dot com
` (2 subsequent siblings)
15 siblings, 0 replies; 17+ messages in thread
From: unlvsur at live dot com @ 2023-06-06 7:01 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102974
--- Comment #13 from cqwrteur <unlvsur at live dot com> ---
Hi, the problem comes out GCC does not do a very good job to deal with crypto
computations that usually exploit all sorts of patterns.
template<typename T>
inline constexpr T add_carry_no_carry_in(T a,T b,T& carryout) noexcept
{
T res{a+b};
carryout=res<a;
return res;
}
template<typename T>
inline constexpr T add_carry(T a,T b,T carryin,T& carryout) noexcept
{
assume(carryout==0||carryout==1);
a=add_carry_no_carry_in(carryin,a,carryout);
a=add_carry_no_carry_in(a,b,carryin);
carryout+=carryin;
return a;
}
See this pattern
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106865
I suggest just adding this addc pattern to GCC instead of adding a built-in
like clang. This can improve the existing code. It is, however, needed for
adding a backend hook for dealing with it.
^ permalink raw reply [flat|nested] 17+ messages in thread
* [Bug target/102974] GCC optimization is very poor for add carry and multiplication combos
2021-10-27 21:17 [Bug tree-optimization/102974] New: GCC optimization is very poor for add carry and multiplication combos unlvsur at live dot com
` (12 preceding siblings ...)
2023-06-06 7:01 ` unlvsur at live dot com
@ 2023-06-06 7:02 ` unlvsur at live dot com
2023-06-06 7:05 ` unlvsur at live dot com
2023-06-06 17:19 ` slash.tmp at free dot fr
15 siblings, 0 replies; 17+ messages in thread
From: unlvsur at live dot com @ 2023-06-06 7:02 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102974
--- Comment #14 from cqwrteur <unlvsur at live dot com> ---
template<typename T>
inline constexpr T add_carry_no_carry_in(T a,T b,T& carryout) noexcept
{
T res{a+b};
carryout=res<a;
return res;
}
template<typename T>
inline constexpr T add_carry(T a,T b,T carryin,T& carryout) noexcept
{
assume(carryin==0||carryin==1);
a=add_carry_no_carry_in(carryin,a,carryout);
a=add_carry_no_carry_in(a,b,carryin);
carryout+=carryin;
assume(carryout==0||carryout==1);
return a;
}
^ permalink raw reply [flat|nested] 17+ messages in thread
* [Bug target/102974] GCC optimization is very poor for add carry and multiplication combos
2021-10-27 21:17 [Bug tree-optimization/102974] New: GCC optimization is very poor for add carry and multiplication combos unlvsur at live dot com
` (13 preceding siblings ...)
2023-06-06 7:02 ` unlvsur at live dot com
@ 2023-06-06 7:05 ` unlvsur at live dot com
2023-06-06 17:19 ` slash.tmp at free dot fr
15 siblings, 0 replies; 17+ messages in thread
From: unlvsur at live dot com @ 2023-06-06 7:05 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102974
--- Comment #15 from cqwrteur <unlvsur at live dot com> ---
template<::std::unsigned_integral T>
inline constexpr T add_carry_no_carry_in(T a,T b,T& carryout) noexcept
{
T res{a+b};
carryout=res<a;
return res;
}
template<::std::unsigned_integral T>
inline constexpr T add_carry(T a,T b,T carryin,T& carryout) noexcept
{
assume(carryin==0||carryin==1);
a=add_carry_no_carry_in(carryin,a,carryout);
a=add_carry_no_carry_in(a,b,carryin);
carryout+=carryin;
assume(carryout==0||carryout==1);
return a;
}
^ permalink raw reply [flat|nested] 17+ messages in thread
* [Bug target/102974] GCC optimization is very poor for add carry and multiplication combos
2021-10-27 21:17 [Bug tree-optimization/102974] New: GCC optimization is very poor for add carry and multiplication combos unlvsur at live dot com
` (14 preceding siblings ...)
2023-06-06 7:05 ` unlvsur at live dot com
@ 2023-06-06 17:19 ` slash.tmp at free dot fr
15 siblings, 0 replies; 17+ messages in thread
From: slash.tmp at free dot fr @ 2023-06-06 17:19 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102974
--- Comment #16 from Mason <slash.tmp at free dot fr> ---
For the record, the example I provided was intended to show that, with some
help, GCC can generate good code for bigint multiplication. In this situation,
"help" means a short asm template.
^ permalink raw reply [flat|nested] 17+ messages in thread
end of thread, other threads:[~2023-06-06 17:19 UTC | newest]
Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-10-27 21:17 [Bug tree-optimization/102974] New: GCC optimization is very poor for add carry and multiplication combos unlvsur at live dot com
2021-10-27 21:18 ` [Bug tree-optimization/102974] " pinskia at gcc dot gnu.org
2021-10-27 21:19 ` [Bug target/102974] " pinskia at gcc dot gnu.org
2021-10-27 21:22 ` unlvsur at live dot com
2021-10-27 21:23 ` unlvsur at live dot com
2021-10-27 21:24 ` pinskia at gcc dot gnu.org
2021-10-27 22:39 ` unlvsur at live dot com
2021-10-27 23:02 ` unlvsur at live dot com
2021-10-28 8:20 ` rguenth at gcc dot gnu.org
2021-10-28 9:56 ` unlvsur at live dot com
2021-11-01 10:07 ` marxin at gcc dot gnu.org
2023-06-03 23:53 ` slash.tmp at free dot fr
2023-06-06 6:56 ` slash.tmp at free dot fr
2023-06-06 7:01 ` unlvsur at live dot com
2023-06-06 7:02 ` unlvsur at live dot com
2023-06-06 7:05 ` unlvsur at live dot com
2023-06-06 17:19 ` slash.tmp at free dot fr
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).