public inbox for gcc@gcc.gnu.org
 help / color / mirror / Atom feed
* loops and optimizations
@ 2019-09-02  7:19 Gero Peterhoff
  2019-09-02 10:14 ` Richard Biener
  0 siblings, 1 reply; 2+ messages in thread
From: Gero Peterhoff @ 2019-09-02  7:19 UTC (permalink / raw)
  To: gcc

Hello gcc team,
I have a big problem with code generation for conditions and loop control and in general.

1) Once a function is defined as inline, all attributes and pragmas are ignored. There is therefore no control at all for loops within inline functions.
2) Only (almost) always conditional jumps are generated:
- even mini-functions do not produce cmov
- the jump control itself is completely inefficient and therefore slow, since conditional jumps are not summarized
- The functionality is always moved to the end of a function and must return to the exit point.
All this means that a lot of codebloats are generated. It jumps around in confusion. This kill any jump prediction.
An Example for swap bits in a integer:

#define	bitsof(Arg)	std::size_t(sizeof(Arg) * CHAR_BIT)
using numeric_type	= std::uint8_t;

template <typename Type, typename Bits>
constexpr bool	is_bits(const Type&, const Bits& bits)	noexcept
{
	return bits < bitsof(Type);
}

template <bool Numeric, typename... Args>
constexpr bool	all_of(const Args... args)	noexcept
{
	static_assert
	(
		sizeof...(Args) <= std::size_t{std::numeric_limits<numeric_type>::max()}	&&
		sizeof...(Args) > 0,
		"invalid argument count"
	);

	if constexpr (Numeric)
		return (numeric_type(bool(args)) + ...) == numeric_type(sizeof...(Args));
	else
		return (bool(args) && ...);
}

template <size_t Test, typename Type, typename Bit>
constexpr Type	swap(const Type& arg, const Bit& bit1, const Bit& bit2)	noexcept
{
	//	Test 0 = default
	//	Test 1 = fold
	//	Test 2 = numeric

	using unsigned_type = std::make_unsigned_t<Type>;

	auto work = [&]() constexpr noexcept -> Type
	{
		unsigned_type
			res = arg;

		res = ((res>>bit1) ^ (res>>bit2)) & unsigned_type{1};
		res = (res<<bit1) | (res<<bit2);
		res ^= arg;
		return res;
	};

	if constexpr (Test == 2)
		return all_of<true>(bit1!=bit2, is_bits(arg, bit1), is_bits(arg, bit2)) ? work() : arg;
	else if constexpr (Test == 1)
		return all_of<false>(bit1!=bit2, is_bits(arg, bit1), is_bits(arg, bit2)) ? work() : arg;
	else
		return (bit1!=bit2 && is_bits(arg, bit1) && is_bits(arg, bit2)) ? work() : arg;
}

generates:

1) default:
2 jumps to different targets: 20->28, 26->30
return back 4e->28

0000000000000000 <unsigned int silent::bits::swap<0ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)>:
    0:	49 89 d0             	mov    %rdx,%r8
    3:	0f b6 16             	movzbl (%rsi),%edx
    6:	41 0f b6 08          	movzbl (%r8),%ecx
    a:	44 8b 07             	mov    (%rdi),%r8d
    d:	80 fa 1f             	cmp    $0x1f,%dl
   10:	0f 96 c0             	setbe  %al
   13:	38 d1                	cmp    %dl,%cl
   15:	0f b6 f9             	movzbl %cl,%edi
   18:	40 0f 95 c6          	setne  %sil
   1c:	21 f0                	and    %esi,%eax
   1e:	a8 01                	test   $0x1,%al
   20:	74 06                	je     28 <unsigned int silent::bits::swapx<0ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)+0x28>
   22:	66 83 ff 1f          	cmp    $0x1f,%di
   26:	76 08                	jbe    30 <unsigned int silent::bits::swapx<0ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)+0x30>
   28:	44 89 c0             	mov    %r8d,%eax
   2b:	c3                   	retq
   2c:	0f 1f 40 00          	nopl   0x0(%rax)
   30:	c4 c2 6b f7 c0       	shrx   %edx,%r8d,%eax
   35:	c4 c2 73 f7 f0       	shrx   %ecx,%r8d,%esi
   3a:	31 f0                	xor    %esi,%eax
   3c:	83 e0 01             	and    $0x1,%eax
   3f:	c4 e2 69 f7 d0       	shlx   %edx,%eax,%edx
   44:	c4 e2 71 f7 c8       	shlx   %ecx,%eax,%ecx
   49:	09 ca                	or     %ecx,%edx
   4b:	41 31 d0             	xor    %edx,%r8d
   4e:	eb d8                	jmp    28 <unsigned int silent::bits::swapx<0ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)+0x28>


2) fold-expr:
3 jumps with one target: 11,17,1d->3d
~25% faster than 1)

0000000000000000 <unsigned int silent::bits::swap<1ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)>:
    0:	0f b6 12             	movzbl (%rdx),%edx
    3:	0f b6 0e             	movzbl (%rsi),%ecx
    6:	44 8b 07             	mov    (%rdi),%r8d
    9:	0f b6 f2             	movzbl %dl,%esi
    c:	89 c8                	mov    %ecx,%eax
    e:	66 39 ce             	cmp    %cx,%si
   11:	74 2a                	je     3d <unsigned int silent::bits::swapx<1ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)+0x3d>
   13:	66 83 f9 1f          	cmp    $0x1f,%cx
   17:	77 24                	ja     3d <unsigned int silent::bits::swapx<1ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)+0x3d>
   19:	66 83 fe 1f          	cmp    $0x1f,%si
   1d:	77 1e                	ja     3d <unsigned int silent::bits::swapx<1ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)+0x3d>
   1f:	c4 c2 7b f7 c8       	shrx   %eax,%r8d,%ecx
   24:	c4 c2 6b f7 f0       	shrx   %edx,%r8d,%esi
   29:	31 f1                	xor    %esi,%ecx
   2b:	83 e1 01             	and    $0x1,%ecx
   2e:	c4 e2 79 f7 c1       	shlx   %eax,%ecx,%eax
   33:	c4 e2 69 f7 d1       	shlx   %edx,%ecx,%edx
   38:	09 d0                	or     %edx,%eax
   3a:	41 31 c0             	xor    %eax,%r8d
   3d:	44 89 c0             	mov    %r8d,%eax
   40:	c3                   	retq


3) numeric
calculate condiditions by numeric:
1 jump to 1 target: 29->49
~50% faster than 1)

0000000000000000 <unsigned int silent::bits::swap<2ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)>:
    0:	0f b6 0a             	movzbl (%rdx),%ecx
    3:	0f b6 16             	movzbl (%rsi),%edx
    6:	31 c0                	xor    %eax,%eax
    8:	44 8b 07             	mov    (%rdi),%r8d
    b:	80 f9 1f             	cmp    $0x1f,%cl
    e:	0f 96 c0             	setbe  %al
   11:	31 f6                	xor    %esi,%esi
   13:	80 fa 1f             	cmp    $0x1f,%dl
   16:	40 0f 96 c6          	setbe  %sil
   1a:	01 f0                	add    %esi,%eax
   1c:	31 f6                	xor    %esi,%esi
   1e:	38 ca                	cmp    %cl,%dl
   20:	40 0f 95 c6          	setne  %sil
   24:	01 f0                	add    %esi,%eax
   26:	83 f8 03             	cmp    $0x3,%eax
   29:	75 1e                	jne    49 <unsigned int silent::bits::swapx<2ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)+0x49>
   2b:	c4 c2 6b f7 c0       	shrx   %edx,%r8d,%eax
   30:	c4 c2 73 f7 f0       	shrx   %ecx,%r8d,%esi
   35:	31 f0                	xor    %esi,%eax
   37:	83 e0 01             	and    $0x1,%eax
   3a:	c4 e2 69 f7 d0       	shlx   %edx,%eax,%edx
   3f:	c4 e2 71 f7 c8       	shlx   %ecx,%eax,%ecx
   44:	09 ca                	or     %ecx,%edx
   46:	41 31 d0             	xor    %edx,%r8d
   49:	44 89 c0             	mov    %r8d,%eax
   4c:	c3                   	retq


but clang can more tricky with adc:
(code by godbolt)

	mov     rax, rdi
	xor     ecx, ecx
         cmp     sil, dl
         setne   cl
         cmp     sil, 64
         adc     ecx, 0
         cmp     dl, 64
         adc     ecx, 0
         cmp     ecx, 3
         jne     .LBB0_2
         shrx    rcx, rax, rsi
         shrx    rdi, rax, rdx
         xor     edi, ecx
         and     edi, 1
         shlx    rcx, rdi, rsi
         shlx    rdx, rdi, rdx
         or      rdx, rcx
         xor     rax, rdx
.LBB0_2:
         ret

I do not understand why you still insist on outdated function attributes, rather than on optimization/loop control within a function.

best regards
Gero

^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: loops and optimizations
  2019-09-02  7:19 loops and optimizations Gero Peterhoff
@ 2019-09-02 10:14 ` Richard Biener
  0 siblings, 0 replies; 2+ messages in thread
From: Richard Biener @ 2019-09-02 10:14 UTC (permalink / raw)
  To: Gero Peterhoff; +Cc: GCC Development

On Mon, Sep 2, 2019 at 9:19 AM Gero Peterhoff <g.peterhoff@t-online.de> wrote:
>
> Hello gcc team,
> I have a big problem with code generation for conditions and loop control and in general.
>
> 1) Once a function is defined as inline, all attributes and pragmas are ignored. There is therefore no control at all for loops within inline functions.
> 2) Only (almost) always conditional jumps are generated:
> - even mini-functions do not produce cmov
> - the jump control itself is completely inefficient and therefore slow, since conditional jumps are not summarized
> - The functionality is always moved to the end of a function and must return to the exit point.
> All this means that a lot of codebloats are generated. It jumps around in confusion. This kill any jump prediction.
> An Example for swap bits in a integer:
>
> #define bitsof(Arg)     std::size_t(sizeof(Arg) * CHAR_BIT)
> using numeric_type      = std::uint8_t;
>
> template <typename Type, typename Bits>
> constexpr bool  is_bits(const Type&, const Bits& bits)  noexcept
> {
>         return bits < bitsof(Type);
> }
>
> template <bool Numeric, typename... Args>
> constexpr bool  all_of(const Args... args)      noexcept
> {
>         static_assert
>         (
>                 sizeof...(Args) <= std::size_t{std::numeric_limits<numeric_type>::max()}        &&
>                 sizeof...(Args) > 0,
>                 "invalid argument count"
>         );
>
>         if constexpr (Numeric)
>                 return (numeric_type(bool(args)) + ...) == numeric_type(sizeof...(Args));
>         else
>                 return (bool(args) && ...);
> }
>
> template <size_t Test, typename Type, typename Bit>
> constexpr Type  swap(const Type& arg, const Bit& bit1, const Bit& bit2) noexcept
> {
>         //      Test 0 = default
>         //      Test 1 = fold
>         //      Test 2 = numeric
>
>         using unsigned_type = std::make_unsigned_t<Type>;
>
>         auto work = [&]() constexpr noexcept -> Type
>         {
>                 unsigned_type
>                         res = arg;
>
>                 res = ((res>>bit1) ^ (res>>bit2)) & unsigned_type{1};
>                 res = (res<<bit1) | (res<<bit2);
>                 res ^= arg;
>                 return res;
>         };
>
>         if constexpr (Test == 2)
>                 return all_of<true>(bit1!=bit2, is_bits(arg, bit1), is_bits(arg, bit2)) ? work() : arg;
>         else if constexpr (Test == 1)
>                 return all_of<false>(bit1!=bit2, is_bits(arg, bit1), is_bits(arg, bit2)) ? work() : arg;
>         else
>                 return (bit1!=bit2 && is_bits(arg, bit1) && is_bits(arg, bit2)) ? work() : arg;
> }

Couldn't get it to compile even with some fiddling.  Pleas always
provide testcases
that can be compiled without editing.

> generates:
>
> 1) default:
> 2 jumps to different targets: 20->28, 26->30
> return back 4e->28
>
> 0000000000000000 <unsigned int silent::bits::swap<0ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)>:
>     0:  49 89 d0                mov    %rdx,%r8
>     3:  0f b6 16                movzbl (%rsi),%edx
>     6:  41 0f b6 08             movzbl (%r8),%ecx
>     a:  44 8b 07                mov    (%rdi),%r8d
>     d:  80 fa 1f                cmp    $0x1f,%dl
>    10:  0f 96 c0                setbe  %al
>    13:  38 d1                   cmp    %dl,%cl
>    15:  0f b6 f9                movzbl %cl,%edi
>    18:  40 0f 95 c6             setne  %sil
>    1c:  21 f0                   and    %esi,%eax
>    1e:  a8 01                   test   $0x1,%al
>    20:  74 06                   je     28 <unsigned int silent::bits::swapx<0ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)+0x28>
>    22:  66 83 ff 1f             cmp    $0x1f,%di
>    26:  76 08                   jbe    30 <unsigned int silent::bits::swapx<0ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)+0x30>
>    28:  44 89 c0                mov    %r8d,%eax
>    2b:  c3                      retq
>    2c:  0f 1f 40 00             nopl   0x0(%rax)
>    30:  c4 c2 6b f7 c0          shrx   %edx,%r8d,%eax
>    35:  c4 c2 73 f7 f0          shrx   %ecx,%r8d,%esi
>    3a:  31 f0                   xor    %esi,%eax
>    3c:  83 e0 01                and    $0x1,%eax
>    3f:  c4 e2 69 f7 d0          shlx   %edx,%eax,%edx
>    44:  c4 e2 71 f7 c8          shlx   %ecx,%eax,%ecx
>    49:  09 ca                   or     %ecx,%edx
>    4b:  41 31 d0                xor    %edx,%r8d
>    4e:  eb d8                   jmp    28 <unsigned int silent::bits::swapx<0ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)+0x28>
>
>
> 2) fold-expr:
> 3 jumps with one target: 11,17,1d->3d
> ~25% faster than 1)
>
> 0000000000000000 <unsigned int silent::bits::swap<1ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)>:
>     0:  0f b6 12                movzbl (%rdx),%edx
>     3:  0f b6 0e                movzbl (%rsi),%ecx
>     6:  44 8b 07                mov    (%rdi),%r8d
>     9:  0f b6 f2                movzbl %dl,%esi
>     c:  89 c8                   mov    %ecx,%eax
>     e:  66 39 ce                cmp    %cx,%si
>    11:  74 2a                   je     3d <unsigned int silent::bits::swapx<1ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)+0x3d>
>    13:  66 83 f9 1f             cmp    $0x1f,%cx
>    17:  77 24                   ja     3d <unsigned int silent::bits::swapx<1ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)+0x3d>
>    19:  66 83 fe 1f             cmp    $0x1f,%si
>    1d:  77 1e                   ja     3d <unsigned int silent::bits::swapx<1ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)+0x3d>
>    1f:  c4 c2 7b f7 c8          shrx   %eax,%r8d,%ecx
>    24:  c4 c2 6b f7 f0          shrx   %edx,%r8d,%esi
>    29:  31 f1                   xor    %esi,%ecx
>    2b:  83 e1 01                and    $0x1,%ecx
>    2e:  c4 e2 79 f7 c1          shlx   %eax,%ecx,%eax
>    33:  c4 e2 69 f7 d1          shlx   %edx,%ecx,%edx
>    38:  09 d0                   or     %edx,%eax
>    3a:  41 31 c0                xor    %eax,%r8d
>    3d:  44 89 c0                mov    %r8d,%eax
>    40:  c3                      retq
>
>
> 3) numeric
> calculate condiditions by numeric:
> 1 jump to 1 target: 29->49
> ~50% faster than 1)
>
> 0000000000000000 <unsigned int silent::bits::swap<2ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)>:
>     0:  0f b6 0a                movzbl (%rdx),%ecx
>     3:  0f b6 16                movzbl (%rsi),%edx
>     6:  31 c0                   xor    %eax,%eax
>     8:  44 8b 07                mov    (%rdi),%r8d
>     b:  80 f9 1f                cmp    $0x1f,%cl
>     e:  0f 96 c0                setbe  %al
>    11:  31 f6                   xor    %esi,%esi
>    13:  80 fa 1f                cmp    $0x1f,%dl
>    16:  40 0f 96 c6             setbe  %sil
>    1a:  01 f0                   add    %esi,%eax
>    1c:  31 f6                   xor    %esi,%esi
>    1e:  38 ca                   cmp    %cl,%dl
>    20:  40 0f 95 c6             setne  %sil
>    24:  01 f0                   add    %esi,%eax
>    26:  83 f8 03                cmp    $0x3,%eax
>    29:  75 1e                   jne    49 <unsigned int silent::bits::swapx<2ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)+0x49>
>    2b:  c4 c2 6b f7 c0          shrx   %edx,%r8d,%eax
>    30:  c4 c2 73 f7 f0          shrx   %ecx,%r8d,%esi
>    35:  31 f0                   xor    %esi,%eax
>    37:  83 e0 01                and    $0x1,%eax
>    3a:  c4 e2 69 f7 d0          shlx   %edx,%eax,%edx
>    3f:  c4 e2 71 f7 c8          shlx   %ecx,%eax,%ecx
>    44:  09 ca                   or     %ecx,%edx
>    46:  41 31 d0                xor    %edx,%r8d
>    49:  44 89 c0                mov    %r8d,%eax
>    4c:  c3                      retq
>
>
> but clang can more tricky with adc:
> (code by godbolt)
>
>         mov     rax, rdi
>         xor     ecx, ecx
>          cmp     sil, dl
>          setne   cl
>          cmp     sil, 64
>          adc     ecx, 0
>          cmp     dl, 64
>          adc     ecx, 0
>          cmp     ecx, 3
>          jne     .LBB0_2
>          shrx    rcx, rax, rsi
>          shrx    rdi, rax, rdx
>          xor     edi, ecx
>          and     edi, 1
>          shlx    rcx, rdi, rsi
>          shlx    rdx, rdi, rdx
>          or      rdx, rcx
>          xor     rax, rdx
> .LBB0_2:
>          ret
>
> I do not understand why you still insist on outdated function attributes, rather than on optimization/loop control within a function.
>
> best regards
> Gero

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2019-09-02 10:14 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-09-02  7:19 loops and optimizations Gero Peterhoff
2019-09-02 10:14 ` Richard Biener

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).