* loops and optimizations
@ 2019-09-02 7:19 Gero Peterhoff
2019-09-02 10:14 ` Richard Biener
0 siblings, 1 reply; 2+ messages in thread
From: Gero Peterhoff @ 2019-09-02 7:19 UTC (permalink / raw)
To: gcc
Hello gcc team,
I have a big problem with code generation for conditions and loop control and in general.
1) Once a function is defined as inline, all attributes and pragmas are ignored. There is therefore no control at all for loops within inline functions.
2) Only (almost) always conditional jumps are generated:
- even mini-functions do not produce cmov
- the jump control itself is completely inefficient and therefore slow, since conditional jumps are not summarized
- The functionality is always moved to the end of a function and must return to the exit point.
All this means that a lot of codebloats are generated. It jumps around in confusion. This kill any jump prediction.
An Example for swap bits in a integer:
#define bitsof(Arg) std::size_t(sizeof(Arg) * CHAR_BIT)
using numeric_type = std::uint8_t;
template <typename Type, typename Bits>
constexpr bool is_bits(const Type&, const Bits& bits) noexcept
{
return bits < bitsof(Type);
}
template <bool Numeric, typename... Args>
constexpr bool all_of(const Args... args) noexcept
{
static_assert
(
sizeof...(Args) <= std::size_t{std::numeric_limits<numeric_type>::max()} &&
sizeof...(Args) > 0,
"invalid argument count"
);
if constexpr (Numeric)
return (numeric_type(bool(args)) + ...) == numeric_type(sizeof...(Args));
else
return (bool(args) && ...);
}
template <size_t Test, typename Type, typename Bit>
constexpr Type swap(const Type& arg, const Bit& bit1, const Bit& bit2) noexcept
{
// Test 0 = default
// Test 1 = fold
// Test 2 = numeric
using unsigned_type = std::make_unsigned_t<Type>;
auto work = [&]() constexpr noexcept -> Type
{
unsigned_type
res = arg;
res = ((res>>bit1) ^ (res>>bit2)) & unsigned_type{1};
res = (res<<bit1) | (res<<bit2);
res ^= arg;
return res;
};
if constexpr (Test == 2)
return all_of<true>(bit1!=bit2, is_bits(arg, bit1), is_bits(arg, bit2)) ? work() : arg;
else if constexpr (Test == 1)
return all_of<false>(bit1!=bit2, is_bits(arg, bit1), is_bits(arg, bit2)) ? work() : arg;
else
return (bit1!=bit2 && is_bits(arg, bit1) && is_bits(arg, bit2)) ? work() : arg;
}
generates:
1) default:
2 jumps to different targets: 20->28, 26->30
return back 4e->28
0000000000000000 <unsigned int silent::bits::swap<0ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)>:
0: 49 89 d0 mov %rdx,%r8
3: 0f b6 16 movzbl (%rsi),%edx
6: 41 0f b6 08 movzbl (%r8),%ecx
a: 44 8b 07 mov (%rdi),%r8d
d: 80 fa 1f cmp $0x1f,%dl
10: 0f 96 c0 setbe %al
13: 38 d1 cmp %dl,%cl
15: 0f b6 f9 movzbl %cl,%edi
18: 40 0f 95 c6 setne %sil
1c: 21 f0 and %esi,%eax
1e: a8 01 test $0x1,%al
20: 74 06 je 28 <unsigned int silent::bits::swapx<0ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)+0x28>
22: 66 83 ff 1f cmp $0x1f,%di
26: 76 08 jbe 30 <unsigned int silent::bits::swapx<0ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)+0x30>
28: 44 89 c0 mov %r8d,%eax
2b: c3 retq
2c: 0f 1f 40 00 nopl 0x0(%rax)
30: c4 c2 6b f7 c0 shrx %edx,%r8d,%eax
35: c4 c2 73 f7 f0 shrx %ecx,%r8d,%esi
3a: 31 f0 xor %esi,%eax
3c: 83 e0 01 and $0x1,%eax
3f: c4 e2 69 f7 d0 shlx %edx,%eax,%edx
44: c4 e2 71 f7 c8 shlx %ecx,%eax,%ecx
49: 09 ca or %ecx,%edx
4b: 41 31 d0 xor %edx,%r8d
4e: eb d8 jmp 28 <unsigned int silent::bits::swapx<0ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)+0x28>
2) fold-expr:
3 jumps with one target: 11,17,1d->3d
~25% faster than 1)
0000000000000000 <unsigned int silent::bits::swap<1ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)>:
0: 0f b6 12 movzbl (%rdx),%edx
3: 0f b6 0e movzbl (%rsi),%ecx
6: 44 8b 07 mov (%rdi),%r8d
9: 0f b6 f2 movzbl %dl,%esi
c: 89 c8 mov %ecx,%eax
e: 66 39 ce cmp %cx,%si
11: 74 2a je 3d <unsigned int silent::bits::swapx<1ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)+0x3d>
13: 66 83 f9 1f cmp $0x1f,%cx
17: 77 24 ja 3d <unsigned int silent::bits::swapx<1ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)+0x3d>
19: 66 83 fe 1f cmp $0x1f,%si
1d: 77 1e ja 3d <unsigned int silent::bits::swapx<1ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)+0x3d>
1f: c4 c2 7b f7 c8 shrx %eax,%r8d,%ecx
24: c4 c2 6b f7 f0 shrx %edx,%r8d,%esi
29: 31 f1 xor %esi,%ecx
2b: 83 e1 01 and $0x1,%ecx
2e: c4 e2 79 f7 c1 shlx %eax,%ecx,%eax
33: c4 e2 69 f7 d1 shlx %edx,%ecx,%edx
38: 09 d0 or %edx,%eax
3a: 41 31 c0 xor %eax,%r8d
3d: 44 89 c0 mov %r8d,%eax
40: c3 retq
3) numeric
calculate condiditions by numeric:
1 jump to 1 target: 29->49
~50% faster than 1)
0000000000000000 <unsigned int silent::bits::swap<2ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)>:
0: 0f b6 0a movzbl (%rdx),%ecx
3: 0f b6 16 movzbl (%rsi),%edx
6: 31 c0 xor %eax,%eax
8: 44 8b 07 mov (%rdi),%r8d
b: 80 f9 1f cmp $0x1f,%cl
e: 0f 96 c0 setbe %al
11: 31 f6 xor %esi,%esi
13: 80 fa 1f cmp $0x1f,%dl
16: 40 0f 96 c6 setbe %sil
1a: 01 f0 add %esi,%eax
1c: 31 f6 xor %esi,%esi
1e: 38 ca cmp %cl,%dl
20: 40 0f 95 c6 setne %sil
24: 01 f0 add %esi,%eax
26: 83 f8 03 cmp $0x3,%eax
29: 75 1e jne 49 <unsigned int silent::bits::swapx<2ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)+0x49>
2b: c4 c2 6b f7 c0 shrx %edx,%r8d,%eax
30: c4 c2 73 f7 f0 shrx %ecx,%r8d,%esi
35: 31 f0 xor %esi,%eax
37: 83 e0 01 and $0x1,%eax
3a: c4 e2 69 f7 d0 shlx %edx,%eax,%edx
3f: c4 e2 71 f7 c8 shlx %ecx,%eax,%ecx
44: 09 ca or %ecx,%edx
46: 41 31 d0 xor %edx,%r8d
49: 44 89 c0 mov %r8d,%eax
4c: c3 retq
but clang can more tricky with adc:
(code by godbolt)
mov rax, rdi
xor ecx, ecx
cmp sil, dl
setne cl
cmp sil, 64
adc ecx, 0
cmp dl, 64
adc ecx, 0
cmp ecx, 3
jne .LBB0_2
shrx rcx, rax, rsi
shrx rdi, rax, rdx
xor edi, ecx
and edi, 1
shlx rcx, rdi, rsi
shlx rdx, rdi, rdx
or rdx, rcx
xor rax, rdx
.LBB0_2:
ret
I do not understand why you still insist on outdated function attributes, rather than on optimization/loop control within a function.
best regards
Gero
^ permalink raw reply [flat|nested] 2+ messages in thread
* Re: loops and optimizations
2019-09-02 7:19 loops and optimizations Gero Peterhoff
@ 2019-09-02 10:14 ` Richard Biener
0 siblings, 0 replies; 2+ messages in thread
From: Richard Biener @ 2019-09-02 10:14 UTC (permalink / raw)
To: Gero Peterhoff; +Cc: GCC Development
On Mon, Sep 2, 2019 at 9:19 AM Gero Peterhoff <g.peterhoff@t-online.de> wrote:
>
> Hello gcc team,
> I have a big problem with code generation for conditions and loop control and in general.
>
> 1) Once a function is defined as inline, all attributes and pragmas are ignored. There is therefore no control at all for loops within inline functions.
> 2) Only (almost) always conditional jumps are generated:
> - even mini-functions do not produce cmov
> - the jump control itself is completely inefficient and therefore slow, since conditional jumps are not summarized
> - The functionality is always moved to the end of a function and must return to the exit point.
> All this means that a lot of codebloats are generated. It jumps around in confusion. This kill any jump prediction.
> An Example for swap bits in a integer:
>
> #define bitsof(Arg) std::size_t(sizeof(Arg) * CHAR_BIT)
> using numeric_type = std::uint8_t;
>
> template <typename Type, typename Bits>
> constexpr bool is_bits(const Type&, const Bits& bits) noexcept
> {
> return bits < bitsof(Type);
> }
>
> template <bool Numeric, typename... Args>
> constexpr bool all_of(const Args... args) noexcept
> {
> static_assert
> (
> sizeof...(Args) <= std::size_t{std::numeric_limits<numeric_type>::max()} &&
> sizeof...(Args) > 0,
> "invalid argument count"
> );
>
> if constexpr (Numeric)
> return (numeric_type(bool(args)) + ...) == numeric_type(sizeof...(Args));
> else
> return (bool(args) && ...);
> }
>
> template <size_t Test, typename Type, typename Bit>
> constexpr Type swap(const Type& arg, const Bit& bit1, const Bit& bit2) noexcept
> {
> // Test 0 = default
> // Test 1 = fold
> // Test 2 = numeric
>
> using unsigned_type = std::make_unsigned_t<Type>;
>
> auto work = [&]() constexpr noexcept -> Type
> {
> unsigned_type
> res = arg;
>
> res = ((res>>bit1) ^ (res>>bit2)) & unsigned_type{1};
> res = (res<<bit1) | (res<<bit2);
> res ^= arg;
> return res;
> };
>
> if constexpr (Test == 2)
> return all_of<true>(bit1!=bit2, is_bits(arg, bit1), is_bits(arg, bit2)) ? work() : arg;
> else if constexpr (Test == 1)
> return all_of<false>(bit1!=bit2, is_bits(arg, bit1), is_bits(arg, bit2)) ? work() : arg;
> else
> return (bit1!=bit2 && is_bits(arg, bit1) && is_bits(arg, bit2)) ? work() : arg;
> }
Couldn't get it to compile even with some fiddling. Pleas always
provide testcases
that can be compiled without editing.
> generates:
>
> 1) default:
> 2 jumps to different targets: 20->28, 26->30
> return back 4e->28
>
> 0000000000000000 <unsigned int silent::bits::swap<0ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)>:
> 0: 49 89 d0 mov %rdx,%r8
> 3: 0f b6 16 movzbl (%rsi),%edx
> 6: 41 0f b6 08 movzbl (%r8),%ecx
> a: 44 8b 07 mov (%rdi),%r8d
> d: 80 fa 1f cmp $0x1f,%dl
> 10: 0f 96 c0 setbe %al
> 13: 38 d1 cmp %dl,%cl
> 15: 0f b6 f9 movzbl %cl,%edi
> 18: 40 0f 95 c6 setne %sil
> 1c: 21 f0 and %esi,%eax
> 1e: a8 01 test $0x1,%al
> 20: 74 06 je 28 <unsigned int silent::bits::swapx<0ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)+0x28>
> 22: 66 83 ff 1f cmp $0x1f,%di
> 26: 76 08 jbe 30 <unsigned int silent::bits::swapx<0ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)+0x30>
> 28: 44 89 c0 mov %r8d,%eax
> 2b: c3 retq
> 2c: 0f 1f 40 00 nopl 0x0(%rax)
> 30: c4 c2 6b f7 c0 shrx %edx,%r8d,%eax
> 35: c4 c2 73 f7 f0 shrx %ecx,%r8d,%esi
> 3a: 31 f0 xor %esi,%eax
> 3c: 83 e0 01 and $0x1,%eax
> 3f: c4 e2 69 f7 d0 shlx %edx,%eax,%edx
> 44: c4 e2 71 f7 c8 shlx %ecx,%eax,%ecx
> 49: 09 ca or %ecx,%edx
> 4b: 41 31 d0 xor %edx,%r8d
> 4e: eb d8 jmp 28 <unsigned int silent::bits::swapx<0ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)+0x28>
>
>
> 2) fold-expr:
> 3 jumps with one target: 11,17,1d->3d
> ~25% faster than 1)
>
> 0000000000000000 <unsigned int silent::bits::swap<1ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)>:
> 0: 0f b6 12 movzbl (%rdx),%edx
> 3: 0f b6 0e movzbl (%rsi),%ecx
> 6: 44 8b 07 mov (%rdi),%r8d
> 9: 0f b6 f2 movzbl %dl,%esi
> c: 89 c8 mov %ecx,%eax
> e: 66 39 ce cmp %cx,%si
> 11: 74 2a je 3d <unsigned int silent::bits::swapx<1ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)+0x3d>
> 13: 66 83 f9 1f cmp $0x1f,%cx
> 17: 77 24 ja 3d <unsigned int silent::bits::swapx<1ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)+0x3d>
> 19: 66 83 fe 1f cmp $0x1f,%si
> 1d: 77 1e ja 3d <unsigned int silent::bits::swapx<1ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)+0x3d>
> 1f: c4 c2 7b f7 c8 shrx %eax,%r8d,%ecx
> 24: c4 c2 6b f7 f0 shrx %edx,%r8d,%esi
> 29: 31 f1 xor %esi,%ecx
> 2b: 83 e1 01 and $0x1,%ecx
> 2e: c4 e2 79 f7 c1 shlx %eax,%ecx,%eax
> 33: c4 e2 69 f7 d1 shlx %edx,%ecx,%edx
> 38: 09 d0 or %edx,%eax
> 3a: 41 31 c0 xor %eax,%r8d
> 3d: 44 89 c0 mov %r8d,%eax
> 40: c3 retq
>
>
> 3) numeric
> calculate condiditions by numeric:
> 1 jump to 1 target: 29->49
> ~50% faster than 1)
>
> 0000000000000000 <unsigned int silent::bits::swap<2ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)>:
> 0: 0f b6 0a movzbl (%rdx),%ecx
> 3: 0f b6 16 movzbl (%rsi),%edx
> 6: 31 c0 xor %eax,%eax
> 8: 44 8b 07 mov (%rdi),%r8d
> b: 80 f9 1f cmp $0x1f,%cl
> e: 0f 96 c0 setbe %al
> 11: 31 f6 xor %esi,%esi
> 13: 80 fa 1f cmp $0x1f,%dl
> 16: 40 0f 96 c6 setbe %sil
> 1a: 01 f0 add %esi,%eax
> 1c: 31 f6 xor %esi,%esi
> 1e: 38 ca cmp %cl,%dl
> 20: 40 0f 95 c6 setne %sil
> 24: 01 f0 add %esi,%eax
> 26: 83 f8 03 cmp $0x3,%eax
> 29: 75 1e jne 49 <unsigned int silent::bits::swapx<2ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)+0x49>
> 2b: c4 c2 6b f7 c0 shrx %edx,%r8d,%eax
> 30: c4 c2 73 f7 f0 shrx %ecx,%r8d,%esi
> 35: 31 f0 xor %esi,%eax
> 37: 83 e0 01 and $0x1,%eax
> 3a: c4 e2 69 f7 d0 shlx %edx,%eax,%edx
> 3f: c4 e2 71 f7 c8 shlx %ecx,%eax,%ecx
> 44: 09 ca or %ecx,%edx
> 46: 41 31 d0 xor %edx,%r8d
> 49: 44 89 c0 mov %r8d,%eax
> 4c: c3 retq
>
>
> but clang can more tricky with adc:
> (code by godbolt)
>
> mov rax, rdi
> xor ecx, ecx
> cmp sil, dl
> setne cl
> cmp sil, 64
> adc ecx, 0
> cmp dl, 64
> adc ecx, 0
> cmp ecx, 3
> jne .LBB0_2
> shrx rcx, rax, rsi
> shrx rdi, rax, rdx
> xor edi, ecx
> and edi, 1
> shlx rcx, rdi, rsi
> shlx rdx, rdi, rdx
> or rdx, rcx
> xor rax, rdx
> .LBB0_2:
> ret
>
> I do not understand why you still insist on outdated function attributes, rather than on optimization/loop control within a function.
>
> best regards
> Gero
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2019-09-02 10:14 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-09-02 7:19 loops and optimizations Gero Peterhoff
2019-09-02 10:14 ` Richard Biener
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).