public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug tree-optimization/102974] New: GCC optimization is very poor for add carry and multiplication combos
@ 2021-10-27 21:17 unlvsur at live dot com
  2021-10-27 21:18 ` [Bug tree-optimization/102974] " pinskia at gcc dot gnu.org
                   ` (15 more replies)
  0 siblings, 16 replies; 17+ messages in thread
From: unlvsur at live dot com @ 2021-10-27 21:17 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102974

            Bug ID: 102974
           Summary: GCC optimization is very poor for add carry and
                    multiplication combos
           Product: gcc
           Version: 12.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: unlvsur at live dot com
  Target Milestone: ---

GCC:
https://godbolt.org/z/6sc6v5YcG

clang:
https://godbolt.org/z/eP8fTrWzd
msvc:
https://godbolt.org/z/snzoEe5he
GCC generates 16 more instructions than msvc and clang for carry flag
optimizations.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [Bug tree-optimization/102974] GCC optimization is very poor for add carry and multiplication combos
  2021-10-27 21:17 [Bug tree-optimization/102974] New: GCC optimization is very poor for add carry and multiplication combos unlvsur at live dot com
@ 2021-10-27 21:18 ` pinskia at gcc dot gnu.org
  2021-10-27 21:19 ` [Bug target/102974] " pinskia at gcc dot gnu.org
                   ` (14 subsequent siblings)
  15 siblings, 0 replies; 17+ messages in thread
From: pinskia at gcc dot gnu.org @ 2021-10-27 21:18 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102974

--- Comment #1 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
#include<cstdint>
#include<bit>

// Pair of 32-bit limbs used to split a 64-bit value via __builtin_bit_cast
// (low limb first, i.e. a little-endian limb order).
struct ul32x2
{
    std::uint_least32_t low,high;
};

// 32x32 -> 64 widening multiply: returns the low 32 bits of a*b and stores
// the high 32 bits into `high`.  On little/big-endian hosts both halves are
// extracted with one __builtin_bit_cast instead of a shift.
// NOTE(review): several statements below are wrapped across physical lines
// by the mail archiver; the token stream is still valid C++.
inline constexpr std::uint_least32_t umul_least_32(std::uint_least32_t
a,std::uint_least32_t b,std::uint_least32_t& high) noexcept
{
        // bit_cast path: pick a limb layout matching the native byte order
        // so that .low/.high land on the right halves of the product.
        if
constexpr(std::endian::native==std::endian::little||std::endian::native==std::endian::big)
        {
                struct ul32x2_little_endian_t
                {
                        std::uint_least32_t low,high;
                };
                struct ul32x2_big_endian_t
                {
                        std::uint_least32_t high,low;
                };
                using ul32x2_t =
std::conditional_t<std::endian::native==std::endian::little,ul32x2_little_endian_t,ul32x2_big_endian_t>;
                // NOTE(review): assumes the two-limb struct has exactly the
                // size of uint_least64_t (bit_cast requires it) — confirm on
                // targets with padding or wider "least" types.
                auto
ret{__builtin_bit_cast(ul32x2_t,static_cast<std::uint_least64_t>(a)*b)};
                high=ret.high;
                return ret.low;
        }
        else
        {
                // Mixed-endian fallback: plain shift/truncate of the product.
                std::uint_least64_t v{static_cast<std::uint_least64_t>(a)*b};
                high=static_cast<std::uint_least32_t>(v>>32u);
                return static_cast<std::uint_least32_t>(v);
        }
}
// Portable add-with-carry: out = a + b + carry (mod 2^N); returns the
// carry-out bit of the full addition.
//
// Carry detection: `temp < a` fires iff `carry + a` wrapped (then temp == 0
// and the second comparison cannot also fire), and `out < b` fires iff
// `temp + b` wrapped — at most one partial sum can wrap, so OR-ing the two
// flags yields the single carry-out bit.
template<typename T>
#if __cpp_lib_concepts >= 202002L
requires (std::unsigned_integral<T>)
#endif
inline constexpr bool add_carry_naive(bool carry,T a,T b,T& out) noexcept
{
        // Fix: static_cast avoids an ill-formed narrowing brace-init when T
        // is narrower than int — `carry + a` promotes to int, which the
        // original `T temp{carry+a}` rejects for e.g. std::uint8_t.
        T temp{static_cast<T>(carry+a)};
        out=temp+b;
        return (out < b) | (temp < a);
}

// Add-with-carry dispatcher: computes out = a + b + carry and returns the
// carry-out bit.  In constant evaluation it falls back to the portable
// add_carry_naive; at run time it selects a matching platform intrinsic
// (MSVC _addcarry*/_addcarryx*, clang __builtin_addc* by size, gcc/x86
// __builtin_ia32_addcarry*_u32/u64), defaulting to the naive version when
// nothing matches sizeof(T).
// NOTE(review): the `#elif defined(__has_builtin) &&` directives below are
// split across two physical lines (a mail-archive wrapping artifact); as
// shown they are not valid preprocessor input and must have been single
// logical lines originally.  Likewise the trailing //-comment on the
// "16 bit addcarry" return has been wrapped onto a second, uncommented line.
template<typename T>
#if __cpp_lib_concepts >= 202002L
requires (std::unsigned_integral<T>)
#endif
inline constexpr bool add_carry(bool carry,T a,T b,T& out) noexcept
{
#if __cpp_lib_is_constant_evaluated >= 201811L
        // constexpr context: intrinsics are unusable, take the naive path.
        if(std::is_constant_evaluated())
                return add_carry_naive(carry,a,b,out);
        else
#endif
        {
#if defined(_MSC_VER) && !defined(__clang__)
#if (defined(_M_IX86) || defined(_M_AMD64))
        // MSVC on x86/x64: use the ADC/ADX intrinsics directly.
        if constexpr(sizeof(T)==8)
        {
#if defined(_M_AMD64)
                return
_addcarryx_u64(carry,a,b,reinterpret_cast<std::uint64_t*>(__builtin_addressof(out)));
#else
                // 32-bit x86: chain two 32-bit add-with-carry intrinsics
                // for a 64-bit T.
                return _addcarryx_u32(_addcarryx_u32(carry,
               
*reinterpret_cast<std::uint32_t*>(__builtin_addressof(a)),*reinterpret_cast<std::uint32_t*>(__builtin_addressof(b)),reinterpret_cast<std::uint32_t*>(__builtin_addressof(out))),
               
reinterpret_cast<std::uint32_t*>(__builtin_addressof(a))[1],reinterpret_cast<std::uint32_t*>(__builtin_addressof(b))[1],reinterpret_cast<std::uint32_t*>(__builtin_addressof(out))+1);
#endif
        }
        else if constexpr(sizeof(T)==4)
                return
_addcarryx_u32(carry,a,b,reinterpret_cast<std::uint32_t*>(__builtin_addressof(out)));
        else if constexpr(sizeof(T)==2)
                return _addcarry_u16(carry,a,b,reinterpret_cast<short
unsigned*>(__builtin_addressof(out)));
        else if constexpr(sizeof(T)==1)
                return _addcarry_u8(carry,a,b,reinterpret_cast<char
unsigned*>(__builtin_addressof(out)));
        else
                return add_carry_naive(carry,a,b,out);
#else
                // MSVC on a non-x86 target: no intrinsic available.
                return add_carry_naive(carry,a,b,out);
#endif
#elif defined(__has_builtin) &&
(__has_builtin(__builtin_addcb)&&__has_builtin(__builtin_addcs)&&__has_builtin(__builtin_addc)&&__has_builtin(__builtin_addcl)&&__has_builtin(__builtin_addcll))
        // clang: __builtin_addc* family, dispatched on sizeof(T).
        if constexpr(sizeof(T)==sizeof(long long unsigned))
        {
                long long unsigned carryout;
                out=__builtin_addcll(a,b,carry,__builtin_addressof(carryout));
                return carryout;
        }
        else if constexpr(sizeof(T)==sizeof(long unsigned))
        {
                long unsigned carryout;
                out=__builtin_addcl(a,b,carry,__builtin_addressof(carryout));
                return carryout;
        }
        else if constexpr(sizeof(T)==sizeof(unsigned))
        {
                unsigned carryout;
                out=__builtin_addc(a,b,carry,__builtin_addressof(carryout));
                return carryout;
        }
        else if constexpr(sizeof(T)==sizeof(short unsigned))
        {
                short unsigned carryout;
                out=__builtin_addcs(a,b,carry,__builtin_addressof(carryout));
                return carryout;
        }
        else if constexpr(sizeof(T)==sizeof(char unsigned))
        {
                char unsigned carryout;
                out=__builtin_addcb(a,b,carry,__builtin_addressof(carryout));
                return carryout;
        }
        else
        {
                return add_carry_naive(carry,a,b,out);
        }
#elif defined(__has_builtin) &&
(__has_builtin(__builtin_ia32_addcarryx_u32)||__has_builtin(__builtin_ia32_addcarry_u32)||__has_builtin(__builtin_ia32_addcarryx_u64))
        // gcc on x86: ia32 add-with-carry builtins.
        if constexpr(sizeof(T)==8)
        {
#if __has_builtin(__builtin_ia32_addcarryx_u64)
                using may_alias_ptr_type [[gnu::may_alias]] = unsigned long
long*;
                return
__builtin_ia32_addcarryx_u64(carry,a,b,reinterpret_cast<may_alias_ptr_type>(__builtin_addressof(out)));
#else
                // No 64-bit builtin: split the operands with memcpy and
                // chain two 32-bit add-with-carry builtins.
                std::uint32_t a_low;
                std::uint32_t a_high;
               
__builtin_memcpy(__builtin_addressof(a_low),__builtin_addressof(a),4);
               
__builtin_memcpy(__builtin_addressof(a_high),reinterpret_cast<char
const*>(__builtin_addressof(a))+4,4);
                std::uint32_t b_low;
                std::uint32_t b_high;
               
__builtin_memcpy(__builtin_addressof(b_low),__builtin_addressof(b),4);
               
__builtin_memcpy(__builtin_addressof(b_high),reinterpret_cast<char
const*>(__builtin_addressof(b))+4,4);
                using may_alias_ptr_type [[gnu::may_alias]] = unsigned*;
        #if __has_builtin(__builtin_ia32_addcarry_u32)
                return
__builtin_ia32_addcarry_u32(__builtin_ia32_addcarry_u32(carry,a_low,b_low,reinterpret_cast<may_alias_ptr_type>(__builtin_addressof(out))),
               
a_high,b_high,reinterpret_cast<may_alias_ptr_type>(__builtin_addressof(out))+1);
        #elif __has_builtin(__builtin_ia32_addcarryx_u32)
                return
__builtin_ia32_addcarryx_u32(__builtin_ia32_addcarryx_u32(carry,a_low,b_low,reinterpret_cast<may_alias_ptr_type>(__builtin_addressof(out))),
               
a_high,b_high,reinterpret_cast<may_alias_ptr_type>(__builtin_addressof(out))+1);
        #else
                return add_carry_naive(carry,a,b,out);
        #endif
#endif
        }
        else if constexpr(sizeof(T)==4)
        {
                using may_alias_ptr_type [[gnu::may_alias]] = unsigned*;
#if __has_builtin(__builtin_ia32_addcarry_u32)
                return
__builtin_ia32_addcarry_u32(carry,a,b,reinterpret_cast<may_alias_ptr_type>(__builtin_addressof(out)));
#elif __has_builtin(__builtin_ia32_addcarryx_u32)
                return
__builtin_ia32_addcarryx_u32(carry,a,b,reinterpret_cast<may_alias_ptr_type>(__builtin_addressof(out)));
#else
                return add_carry_naive(carry,a,b,out);
#endif
        }
        else
                return add_carry_naive(carry,a,b,out);  //16 bit addcarry
simply does not exist on gcc and clang
#else
        return add_carry_naive(carry,a,b,out);
#endif
        }
}
// 64x64 -> 128 multiply built from four 32x32 -> 64 partial products
// (schoolbook): returns the low 64 bits and stores the high 64 bits in
// `high`.  The carry-chain order below is significant.
std::uint_least64_t umul_least_64(std::uint_least64_t a,std::uint_least64_t
b,std::uint_least64_t& high) noexcept
{
    // Split operands into 32-bit limbs: a = a1:a0, b = b1:b0.
    // NOTE(review): ul32x2 is {low,high}, i.e. a little-endian limb order —
    // confirm intent for big-endian hosts.
    auto [a0,a1]=__builtin_bit_cast(ul32x2,a);
    auto [b0,b1]=__builtin_bit_cast(ul32x2,b);
    std::uint_least32_t c1;
    std::uint_least32_t c0{umul_least_32(a0,b0,c1)};   // c1:c0 = a0*b0
    std::uint_least32_t a0b1h;
    std::uint_least32_t a0b1l{umul_least_32(a0,b1,a0b1h)};   // cross product
    std::uint_least32_t a1b0h;
    std::uint_least32_t a1b0l{umul_least_32(a1,b0,a1b0h)};   // cross product
    std::uint_least32_t c3;
    std::uint_least32_t c2{umul_least_32(a1,b1,c3)};   // c3:c2 = a1*b1
    // Fold the first cross product into limbs 1..2, catching the carry out
    // of limb 2 in `temp` so it can be added into limb 3 afterwards.
    bool carry{add_carry(false,c1,a0b1l,c1)};
    carry=add_carry(carry,a0b1h,c2,c2);
    std::uint_least32_t temp{carry};
    // Fold the second cross product, then propagate both pending carries
    // into the top limb.
    carry=add_carry(false,c1,a1b0l,c1);
    carry=add_carry(carry,a1b0h,c2,c2);
    add_carry(carry,temp,c3,c3);
    high=__builtin_bit_cast(std::uint_least64_t,ul32x2{c2,c3});
    return __builtin_bit_cast(std::uint_least64_t,ul32x2{c0,c1});
}

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [Bug target/102974] GCC optimization is very poor for add carry and multiplication combos
  2021-10-27 21:17 [Bug tree-optimization/102974] New: GCC optimization is very poor for add carry and multiplication combos unlvsur at live dot com
  2021-10-27 21:18 ` [Bug tree-optimization/102974] " pinskia at gcc dot gnu.org
@ 2021-10-27 21:19 ` pinskia at gcc dot gnu.org
  2021-10-27 21:22 ` unlvsur at live dot com
                   ` (13 subsequent siblings)
  15 siblings, 0 replies; 17+ messages in thread
From: pinskia at gcc dot gnu.org @ 2021-10-27 21:19 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102974

Andrew Pinski <pinskia at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
          Component|tree-optimization           |target

--- Comment #2 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
There might be another bug about _addcarryx_u64 already.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [Bug target/102974] GCC optimization is very poor for add carry and multiplication combos
  2021-10-27 21:17 [Bug tree-optimization/102974] New: GCC optimization is very poor for add carry and multiplication combos unlvsur at live dot com
  2021-10-27 21:18 ` [Bug tree-optimization/102974] " pinskia at gcc dot gnu.org
  2021-10-27 21:19 ` [Bug target/102974] " pinskia at gcc dot gnu.org
@ 2021-10-27 21:22 ` unlvsur at live dot com
  2021-10-27 21:23 ` unlvsur at live dot com
                   ` (12 subsequent siblings)
  15 siblings, 0 replies; 17+ messages in thread
From: unlvsur at live dot com @ 2021-10-27 21:22 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102974

--- Comment #3 from cqwrteur <unlvsur at live dot com> ---
(In reply to Andrew Pinski from comment #2)
> There might be another bug about _addcarryx_u64 already.

This is 32 bit addcarry.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [Bug target/102974] GCC optimization is very poor for add carry and multiplication combos
  2021-10-27 21:17 [Bug tree-optimization/102974] New: GCC optimization is very poor for add carry and multiplication combos unlvsur at live dot com
                   ` (2 preceding siblings ...)
  2021-10-27 21:22 ` unlvsur at live dot com
@ 2021-10-27 21:23 ` unlvsur at live dot com
  2021-10-27 21:24 ` pinskia at gcc dot gnu.org
                   ` (11 subsequent siblings)
  15 siblings, 0 replies; 17+ messages in thread
From: unlvsur at live dot com @ 2021-10-27 21:23 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102974

--- Comment #4 from cqwrteur <unlvsur at live dot com> ---
(In reply to cqwrteur from comment #3)
> (In reply to Andrew Pinski from comment #2)
> > There might be another bug about _addcarryx_u64 already.
> 
> This is 32 bit addcarry.

but yeah. GCC does not perform optimizations very well to add carries and mul +
recognize >>64u <<64u patterns

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [Bug target/102974] GCC optimization is very poor for add carry and multiplication combos
  2021-10-27 21:17 [Bug tree-optimization/102974] New: GCC optimization is very poor for add carry and multiplication combos unlvsur at live dot com
                   ` (3 preceding siblings ...)
  2021-10-27 21:23 ` unlvsur at live dot com
@ 2021-10-27 21:24 ` pinskia at gcc dot gnu.org
  2021-10-27 22:39 ` unlvsur at live dot com
                   ` (10 subsequent siblings)
  15 siblings, 0 replies; 17+ messages in thread
From: pinskia at gcc dot gnu.org @ 2021-10-27 21:24 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102974

--- Comment #5 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
(In reply to cqwrteur from comment #4)
> (In reply to cqwrteur from comment #3)
> > (In reply to Andrew Pinski from comment #2)
> > > There might be another bug about _addcarryx_u64 already.
> > 
> > This is 32 bit addcarry.
> 
> but yeah. GCC does not perform optimizations very well to add carries and
> mul + recognize >>64u <<64u patterns

I mean all of _addcarryx_* intrinsics.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [Bug target/102974] GCC optimization is very poor for add carry and multiplication combos
  2021-10-27 21:17 [Bug tree-optimization/102974] New: GCC optimization is very poor for add carry and multiplication combos unlvsur at live dot com
                   ` (4 preceding siblings ...)
  2021-10-27 21:24 ` pinskia at gcc dot gnu.org
@ 2021-10-27 22:39 ` unlvsur at live dot com
  2021-10-27 23:02 ` unlvsur at live dot com
                   ` (9 subsequent siblings)
  15 siblings, 0 replies; 17+ messages in thread
From: unlvsur at live dot com @ 2021-10-27 22:39 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102974

--- Comment #6 from cqwrteur <unlvsur at live dot com> ---
(In reply to Andrew Pinski from comment #5)
> (In reply to cqwrteur from comment #4)
> > (In reply to cqwrteur from comment #3)
> > > (In reply to Andrew Pinski from comment #2)
> > > > There might be another bug about _addcarryx_u64 already.
> > > 
> > > This is 32 bit addcarry.
> > 
> > but yeah. GCC does not perform optimizations very well to add carries and
> > mul + recognize >>64u <<64u patterns
> 
> I mean all of _addcarryx_* intrinsics.

https://godbolt.org/z/qq3nb49Eq
https://godbolt.org/z/cqoYG35jx
Also this is weird: just extracting part of the code into a function generates
different assembly for __builtin_bit_cast. It must be an inliner bug.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [Bug target/102974] GCC optimization is very poor for add carry and multiplication combos
  2021-10-27 21:17 [Bug tree-optimization/102974] New: GCC optimization is very poor for add carry and multiplication combos unlvsur at live dot com
                   ` (5 preceding siblings ...)
  2021-10-27 22:39 ` unlvsur at live dot com
@ 2021-10-27 23:02 ` unlvsur at live dot com
  2021-10-28  8:20 ` rguenth at gcc dot gnu.org
                   ` (8 subsequent siblings)
  15 siblings, 0 replies; 17+ messages in thread
From: unlvsur at live dot com @ 2021-10-27 23:02 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102974

--- Comment #7 from cqwrteur <unlvsur at live dot com> ---
(In reply to cqwrteur from comment #6)
> (In reply to Andrew Pinski from comment #5)
> > (In reply to cqwrteur from comment #4)
> > > (In reply to cqwrteur from comment #3)
> > > > (In reply to Andrew Pinski from comment #2)
> > > > > There might be another bug about _addcarryx_u64 already.
> > > > 
> > > > This is 32 bit addcarry.
> > > 
> > > but yeah. GCC does not perform optimizations very well to add carries and
> > > mul + recognize >>64u <<64u patterns
> > 
> > I mean all of _addcarryx_* intrinsics.
> 
> https://godbolt.org/z/qq3nb49Eq
> https://godbolt.org/z/cqoYG35jx
> Also this is weird: just extracting part of the code into a function
> generates different assembly for __builtin_bit_cast. It must be an inliner
> bug.

my fault for misreading(In reply to Andrew Pinski from comment #5)
> (In reply to cqwrteur from comment #4)
> > (In reply to cqwrteur from comment #3)
> > > (In reply to Andrew Pinski from comment #2)
> > > > There might be another bug about _addcarryx_u64 already.
> > > 
> > > This is 32 bit addcarry.
> > 
> > but yeah. GCC does not perform optimizations very well to add carries and
> > mul + recognize >>64u <<64u patterns
> 
> I mean all of _addcarryx_* intrinsics.

This example is also interesting that -O2, -O3, -Ofast generates much worse
assembly than -O1. There is no point for doing SIMD for things like this

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [Bug target/102974] GCC optimization is very poor for add carry and multiplication combos
  2021-10-27 21:17 [Bug tree-optimization/102974] New: GCC optimization is very poor for add carry and multiplication combos unlvsur at live dot com
                   ` (6 preceding siblings ...)
  2021-10-27 23:02 ` unlvsur at live dot com
@ 2021-10-28  8:20 ` rguenth at gcc dot gnu.org
  2021-10-28  9:56 ` unlvsur at live dot com
                   ` (7 subsequent siblings)
  15 siblings, 0 replies; 17+ messages in thread
From: rguenth at gcc dot gnu.org @ 2021-10-28  8:20 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102974

--- Comment #8 from Richard Biener <rguenth at gcc dot gnu.org> ---
As mentioned in the other bug STV might make things only worse.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [Bug target/102974] GCC optimization is very poor for add carry and multiplication combos
  2021-10-27 21:17 [Bug tree-optimization/102974] New: GCC optimization is very poor for add carry and multiplication combos unlvsur at live dot com
                   ` (7 preceding siblings ...)
  2021-10-28  8:20 ` rguenth at gcc dot gnu.org
@ 2021-10-28  9:56 ` unlvsur at live dot com
  2021-11-01 10:07 ` marxin at gcc dot gnu.org
                   ` (6 subsequent siblings)
  15 siblings, 0 replies; 17+ messages in thread
From: unlvsur at live dot com @ 2021-10-28  9:56 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102974

--- Comment #9 from cqwrteur <unlvsur at live dot com> ---
(In reply to Richard Biener from comment #8)
> As mentioned in the other bug STV might make things only worse.

what is stv?

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [Bug target/102974] GCC optimization is very poor for add carry and multiplication combos
  2021-10-27 21:17 [Bug tree-optimization/102974] New: GCC optimization is very poor for add carry and multiplication combos unlvsur at live dot com
                   ` (8 preceding siblings ...)
  2021-10-28  9:56 ` unlvsur at live dot com
@ 2021-11-01 10:07 ` marxin at gcc dot gnu.org
  2023-06-03 23:53 ` slash.tmp at free dot fr
                   ` (5 subsequent siblings)
  15 siblings, 0 replies; 17+ messages in thread
From: marxin at gcc dot gnu.org @ 2021-11-01 10:07 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102974

Martin Liška <marxin at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |marxin at gcc dot gnu.org

--- Comment #10 from Martin Liška <marxin at gcc dot gnu.org> ---
(In reply to cqwrteur from comment #9)
> (In reply to Richard Biener from comment #8)
> > As mentioned in the other bug STV might make things only worse.
> 
> what is stv?

$ gcc --help=target | grep mstv
  -mstv                       Disable Scalar to Vector optimization pass
transforming 64-bit integer computations into a vector ones.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [Bug target/102974] GCC optimization is very poor for add carry and multiplication combos
  2021-10-27 21:17 [Bug tree-optimization/102974] New: GCC optimization is very poor for add carry and multiplication combos unlvsur at live dot com
                   ` (9 preceding siblings ...)
  2021-11-01 10:07 ` marxin at gcc dot gnu.org
@ 2023-06-03 23:53 ` slash.tmp at free dot fr
  2023-06-06  6:56 ` slash.tmp at free dot fr
                   ` (4 subsequent siblings)
  15 siblings, 0 replies; 17+ messages in thread
From: slash.tmp at free dot fr @ 2023-06-03 23:53 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102974

--- Comment #11 from Mason <slash.tmp at free dot fr> ---
Here's umul_least_64() rewritten as mul_64x64_128() in C

typedef unsigned int u32;
typedef unsigned long long u64;

/* u32 acc[3], a[1], b[1] */
// Multiply-accumulate: acc[0..2] += a[0] * b[0], with full carry
// propagation into the third limb via the trailing `adc $0`.
static void mul_add_32x32(u32 *acc, const u32 *a, const u32 *b)
{
  u64 res = (u64)a[0] * b[0];
  u32 lo = res, hi = res >> 32;
  // add lo into acc[0]; adc hi+CF into acc[1]; adc 0+CF into acc[2].
  asm("add %[LO], %[D0]\n\t" "adc %[HI], %[D1]\n\t" "adc $0, %[D2]" :
  [D0] "+m" (acc[0]), [D1] "+m" (acc[1]), [D2] "+m" (acc[2]) :
  [LO] "r" (lo), [HI] "r" (hi) : "cc");
}

/* u32 acc[5], a[2], b[2] */
// 64x64 -> 128 multiply-accumulate using four 32x32 partial products.
// NOTE(review): the helper only accumulates, so acc must be
// pre-initialized (typically zeroed) by the caller — confirm at call sites.
void mul_64x64_128(u32 *acc, const u32 *a, const u32 *b)
{
  mul_add_32x32(acc+0, a+0, b+0);   // a0*b0 at limb 0
  mul_add_32x32(acc+1, a+0, b+1);   // a0*b1 at limb 1
  mul_add_32x32(acc+1, a+1, b+0);   // a1*b0 at limb 1
  mul_add_32x32(acc+2, a+1, b+1);   // a1*b1 at limb 2
}

gcc-trunk -O3 -m32

mul_64x64_128:
  pushl %esi
  pushl %ebx
  movl 16(%esp), %ebx   ; ebx = a
  movl 20(%esp), %esi   ; esi = b
  movl 12(%esp), %ecx   ; ecx = acc
  movl (%esi), %eax     ; b0
  mull (%ebx)           ; a0*b0
  add %eax, (%ecx)
  adc %edx, 4(%ecx)
  adc $0, 8(%ecx)
  movl 4(%esi), %eax    ; b1
  mull (%ebx)           ; a0*b1
  add %eax, 4(%ecx)
  adc %edx, 8(%ecx)
  adc $0, 12(%ecx)
  movl (%esi), %eax     ; b0
  mull 4(%ebx)          ; a1*b0
  add %eax, 4(%ecx)
  adc %edx, 8(%ecx)
  adc $0, 12(%ecx)
  movl 4(%esi), %eax    ; b1
  mull 4(%ebx)          ; a1*b1
  add %eax, 8(%ecx)
  adc %edx, 12(%ecx)
  adc $0, 16(%ecx)
  popl %ebx
  popl %esi
  ret

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [Bug target/102974] GCC optimization is very poor for add carry and multiplication combos
  2021-10-27 21:17 [Bug tree-optimization/102974] New: GCC optimization is very poor for add carry and multiplication combos unlvsur at live dot com
                   ` (10 preceding siblings ...)
  2023-06-03 23:53 ` slash.tmp at free dot fr
@ 2023-06-06  6:56 ` slash.tmp at free dot fr
  2023-06-06  7:01 ` unlvsur at live dot com
                   ` (3 subsequent siblings)
  15 siblings, 0 replies; 17+ messages in thread
From: slash.tmp at free dot fr @ 2023-06-06  6:56 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102974

--- Comment #12 from Mason <slash.tmp at free dot fr> ---
Actually, in this case, we don't need to propagate the carry over 3 limbs.

typedef unsigned int u32;
typedef unsigned long long u64;

/* u32 acc[2], a[1], b[1] */
// Two-limb multiply-accumulate: acc[0..1] += a[0] * b[0]; any carry out of
// acc[1] is silently dropped.
// NOTE(review): dropping that carry is not always safe for the
// mul_64x64_128 below: with a = b = 0xFFFFFFFFFFFFFFFF (acc zeroed), the
// third call carries out of its second limb and the carry is lost, so the
// top limb comes out 0xFFFFFFFE instead of 0xFFFFFFFF.  The three-limb
// variant in comment #11 does not have this problem.
static void mul_add_32x32(u32 *acc, const u32 *a, const u32 *b)
{
  u64 res = (u64)a[0] * b[0];
  u32 lo = res, hi = res >> 32;
  // add lo into acc[0]; adc hi+CF into acc[1]; carry out of acc[1] is lost.
  asm("add %[LO], %[D0]\n\t" "adc %[HI], %[D1]" :
  [D0] "+m" (acc[0]), [D1] "+m" (acc[1]) :
  [LO] "r" (lo), [HI] "r" (hi) : "cc");
}

/* u32 acc[4], a[2], b[2] */
// 64x64 -> 128 multiply-accumulate built on a two-limb helper that drops
// the carry out of its second limb.
// NOTE(review): that dropped carry makes this version lossy: for
// a = b = 0xFFFFFFFFFFFFFFFF with acc zeroed, the a1*b0 call carries out of
// acc[2] and the result's top limb acc[3] is computed as 0xFFFFFFFE instead
// of the correct 0xFFFFFFFF.  The claim that 3-limb propagation is
// unnecessary does not hold in general.
void mul_64x64_128(u32 *acc, const u32 *a, const u32 *b)
{
  mul_add_32x32(acc+0, a+0, b+0);   // a0*b0 at limb 0
  mul_add_32x32(acc+1, a+0, b+1);   // a0*b1 at limb 1
  mul_add_32x32(acc+1, a+1, b+0);   // a1*b0 at limb 1
  mul_add_32x32(acc+2, a+1, b+1);   // a1*b1 at limb 2
}

gcc-trunk -O3 -m32

mul_64x64_128:
  pushl %esi
  pushl %ebx
  movl 16(%esp), %ebx
  movl 20(%esp), %esi
  movl 12(%esp), %ecx
  movl (%esi), %eax
  mull (%ebx)
  add %eax, (%ecx)
  adc %edx, 4(%ecx)
  movl 4(%esi), %eax
  mull (%ebx)
  add %eax, 4(%ecx)
  adc %edx, 8(%ecx)
  movl (%esi), %eax
  mull 4(%ebx)
  add %eax, 4(%ecx)
  adc %edx, 8(%ecx)
  movl 4(%esi), %eax
  mull 4(%ebx)
  add %eax, 8(%ecx)
  adc %edx, 12(%ecx)
  popl %ebx
  popl %esi
  ret

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [Bug target/102974] GCC optimization is very poor for add carry and multiplication combos
  2021-10-27 21:17 [Bug tree-optimization/102974] New: GCC optimization is very poor for add carry and multiplication combos unlvsur at live dot com
                   ` (11 preceding siblings ...)
  2023-06-06  6:56 ` slash.tmp at free dot fr
@ 2023-06-06  7:01 ` unlvsur at live dot com
  2023-06-06  7:02 ` unlvsur at live dot com
                   ` (2 subsequent siblings)
  15 siblings, 0 replies; 17+ messages in thread
From: unlvsur at live dot com @ 2023-06-06  7:01 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102974

--- Comment #13 from cqwrteur <unlvsur at live dot com> ---
Hi, it turns out that GCC does not do a very good job of dealing with crypto
computations, which usually exploit all sorts of patterns.


// Single-limb unsigned addition with no incoming carry.
// Stores the carry-out bit (0 or 1) of a + b into `carryout` and returns
// the (possibly wrapped) sum; unsigned wraparound is detected via sum < a.
template<typename T>
inline constexpr T add_carry_no_carry_in(T a,T b,T& carryout) noexcept
{
    T sum{a+b};
    carryout = (sum < a) ? T(1) : T(0);
    return sum;
}

// Full add-with-carry: returns a + b + carryin (mod 2^N); `carryout`
// receives the carry-out bit (0 or 1).
// Fix: the entry precondition must constrain the *input* carry (`carryin`),
// not `carryout`, which is write-only here and still holds the caller's
// garbage on entry.  This matches the author's corrected repost in the
// follow-up message (comment #14).
template<typename T>
inline constexpr T add_carry(T a,T b,T carryin,T& carryout) noexcept
{
    assume(carryin==0||carryin==1);
    // Add the incoming carry to a, then add b.  At most one of the two
    // partial additions can wrap (if the first wraps its result is 0, and
    // 0 + b cannot wrap), so carryout + carryin is 0 or 1.
    a=add_carry_no_carry_in(carryin,a,carryout);
    a=add_carry_no_carry_in(a,b,carryin);
    carryout+=carryin;
    assume(carryout==0||carryout==1);
    return a;
}

See this pattern
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106865

I suggest just adding this addc pattern to GCC instead of adding a built-in
like clang. This can improve the existing code. It is, however, needed for
adding a backend hook for dealing with it.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [Bug target/102974] GCC optimization is very poor for add carry and multiplication combos
  2021-10-27 21:17 [Bug tree-optimization/102974] New: GCC optimization is very poor for add carry and multiplication combos unlvsur at live dot com
                   ` (12 preceding siblings ...)
  2023-06-06  7:01 ` unlvsur at live dot com
@ 2023-06-06  7:02 ` unlvsur at live dot com
  2023-06-06  7:05 ` unlvsur at live dot com
  2023-06-06 17:19 ` slash.tmp at free dot fr
  15 siblings, 0 replies; 17+ messages in thread
From: unlvsur at live dot com @ 2023-06-06  7:02 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102974

--- Comment #14 from cqwrteur <unlvsur at live dot com> ---
// Adds two unsigned limbs without a carry-in; stores the carry-out bit in
// `carryout` and returns the wrapped sum.
template<typename T>
inline constexpr T add_carry_no_carry_in(T a,T b,T& carryout) noexcept
{
    T total{a+b};
    bool wrapped = total < a;   // unsigned overflow <=> sum compares below a
    carryout = wrapped;
    return total;
}

// Full add-with-carry: returns a + b + carryin (mod 2^N); carryout receives
// the carry-out bit.  Corrected repost of the previous message's version:
// the entry precondition now constrains carryin (the input) rather than the
// uninitialized carryout.
template<typename T>
inline constexpr T add_carry(T a,T b,T carryin,T& carryout) noexcept
{
    assume(carryin==0||carryin==1);
    // At most one of the two partial additions below can wrap (if the
    // first wraps its result is 0, and 0 + b cannot wrap), so the two
    // carry flags sum to 0 or 1.
    a=add_carry_no_carry_in(carryin,a,carryout);
    a=add_carry_no_carry_in(a,b,carryin);
    carryout+=carryin;
    assume(carryout==0||carryout==1);
    return a;
}

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [Bug target/102974] GCC optimization is very poor for add carry and multiplication combos
  2021-10-27 21:17 [Bug tree-optimization/102974] New: GCC optimization is very poor for add carry and multiplication combos unlvsur at live dot com
                   ` (13 preceding siblings ...)
  2023-06-06  7:02 ` unlvsur at live dot com
@ 2023-06-06  7:05 ` unlvsur at live dot com
  2023-06-06 17:19 ` slash.tmp at free dot fr
  15 siblings, 0 replies; 17+ messages in thread
From: unlvsur at live dot com @ 2023-06-06  7:05 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102974

--- Comment #15 from cqwrteur <unlvsur at live dot com> ---
// Adds two unsigned limbs (no carry-in); stores the carry-out bit in
// `carryout` and returns the wrapped sum.  Same as the previous message's
// version but constrained to unsigned integral types via the C++20 concept.
template<::std::unsigned_integral T>
inline constexpr T add_carry_no_carry_in(T a,T b,T& carryout) noexcept
{
    T res{a+b};
    carryout=res<a;   // unsigned overflow <=> wrapped sum compares below a
    return res;
}

// Full add-with-carry for unsigned integral T: returns a + b + carryin
// (mod 2^N); carryout receives the carry-out bit (0 or 1).
template<::std::unsigned_integral T>
inline constexpr T add_carry(T a,T b,T carryin,T& carryout) noexcept
{
    assume(carryin==0||carryin==1);
    // At most one of the two partial additions can wrap, so the two carry
    // flags sum to 0 or 1.
    a=add_carry_no_carry_in(carryin,a,carryout);
    a=add_carry_no_carry_in(a,b,carryin);
    carryout+=carryin;
    assume(carryout==0||carryout==1);
    return a;
}

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [Bug target/102974] GCC optimization is very poor for add carry and multiplication combos
  2021-10-27 21:17 [Bug tree-optimization/102974] New: GCC optimization is very poor for add carry and multiplication combos unlvsur at live dot com
                   ` (14 preceding siblings ...)
  2023-06-06  7:05 ` unlvsur at live dot com
@ 2023-06-06 17:19 ` slash.tmp at free dot fr
  15 siblings, 0 replies; 17+ messages in thread
From: slash.tmp at free dot fr @ 2023-06-06 17:19 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102974

--- Comment #16 from Mason <slash.tmp at free dot fr> ---
For the record, the example I provided was intended to show that, with some
help, GCC can generate good code for bigint multiplication. In this situation,
"help" means a short asm template.

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2023-06-06 17:19 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-10-27 21:17 [Bug tree-optimization/102974] New: GCC optimization is very poor for add carry and multiplication combos unlvsur at live dot com
2021-10-27 21:18 ` [Bug tree-optimization/102974] " pinskia at gcc dot gnu.org
2021-10-27 21:19 ` [Bug target/102974] " pinskia at gcc dot gnu.org
2021-10-27 21:22 ` unlvsur at live dot com
2021-10-27 21:23 ` unlvsur at live dot com
2021-10-27 21:24 ` pinskia at gcc dot gnu.org
2021-10-27 22:39 ` unlvsur at live dot com
2021-10-27 23:02 ` unlvsur at live dot com
2021-10-28  8:20 ` rguenth at gcc dot gnu.org
2021-10-28  9:56 ` unlvsur at live dot com
2021-11-01 10:07 ` marxin at gcc dot gnu.org
2023-06-03 23:53 ` slash.tmp at free dot fr
2023-06-06  6:56 ` slash.tmp at free dot fr
2023-06-06  7:01 ` unlvsur at live dot com
2023-06-06  7:02 ` unlvsur at live dot com
2023-06-06  7:05 ` unlvsur at live dot com
2023-06-06 17:19 ` slash.tmp at free dot fr

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).