public inbox for gcc-help@gcc.gnu.org
 help / color / mirror / Atom feed
* problems with optimisation
@ 2012-12-28 10:25 Kicer
  2012-12-28 15:19 ` Andrew Haley
  0 siblings, 1 reply; 9+ messages in thread
From: Kicer @ 2012-12-28 10:25 UTC (permalink / raw)
  To: gcc-help

Hi all


In the last few days I've found a problem with certain code optimisations:


namespace 
{
  
  struct Base;
  
  // Lightweight proxy for one bit of a Base: stores a reference to the Base
  // and the index of the bit to test.
  struct Bit
  {
	  // Base whose value is read when this proxy is converted to bool.
	  // Held by reference, so the Bit must not outlive that Base.
	  const Base &m_p;
	  // Index of the bit tested by operator bool().
	  const int m_pos;
	  
	  // Binds the proxy to `p` (by reference, not copied) and bit index `pos`.
	  constexpr Bit(const Base &p, const int pos): m_p(p), m_pos(pos)
	  {
	  }
	
	  // Only declared here; the definition follows the full definition of
	  // Base because it uses Base's conversion to char.
	  operator bool() const;  
  };
  
  // Wraps an I/O port number; converting a Base to char reads that port.
  struct Base
  {  
	  // Port number handed to the `in` instruction through the "d" constraint.
	  const int m_port;
	  constexpr Base(int p): m_port(p)
	  {
	  }
	  
	  // Executes the x86 `in %dx, %al` instruction with m_port in the d
	  // register and returns the byte delivered in the a register.
	  // NOTE(review): the asm statement is not marked volatile, so GCC may
	  // treat it as a pure function of m_port (merge or drop reads) —
	  // confirm that is acceptable for a port read.
	  operator char () const
	  {
		  char result;
		    
		  asm(
			"in %%dx, %%al\n"
			:"=a"(result)
			:"d"(m_port)
		  );
		  
		  //result = *(reinterpret_cast<char *>(m_port+32));
		  
		  return result;
	  }
	  
	  // Returns a Bit proxy that refers to this object and bit index `p`.
	  Bit operator[] (int p) const
	  {
		  Bit r(*this, p);
		  return r;
	  }
	
  };


  // Reads the referenced Base (through its conversion to char) and reports
  // whether bit m_pos of the value read is set.
  Bit::operator bool() const
  {
	  const char v = m_p;
	  // v is promoted to int before the mask is applied.
	  // NOTE(review): `> 0` is equivalent to `!= 0` only while (1 << m_pos)
	  // is positive, i.e. m_pos is below the sign bit of int.
	  const bool r = (v & (1 << m_pos)) > 0;
	  
	  return r;
  }
  
  // A Base for port `o` that additionally owns a second Base for port `o - 1`.
  struct Anc: public Base
  {
	  // Companion port object, constructed with o - 1.
	  const Base m_in;
	  constexpr Anc(int o): Base(o), m_in(o - 1)
	  {
	  }
	  
	  // Gives read-only access to the companion Base.
	  const Base& getIn() const
	  {
		  return m_in;
	  }
	
  };

}

// Builds two Anc objects (ports v and v + 2), sums the bytes read from their
// companion ports (v - 1 and v + 1), then adds 1 when bit 4 of p2 is set.
template<int v>
char foo()
{
	Anc p(v), p2(v+2);
	// Each getIn() result is converted to char (one port read each) before
	// the addition.
	char r = p.getIn() + p2.getIn();
	
	//r += p[0]? 1: 0;                   //commented out at first step
	// p2[4] yields a Bit; its conversion to bool performs another port read.
	r += p2[4]? 1 : 0;
	
	return r;
}


// Returns foo<4>() minus foo<6>(), forcing two instantiations of foo.
char bar()
{
  char r = foo<4>();
  
  r-= foo<6>();
  
  return r;
}

There are 3 structs which look more complex than the code they generate.
foo() and bar() are just using those structs.
For the code above, the output is short and clear, as expected:

   0:	ba 03 00 00 00       	mov    $0x3,%edx
   5:	be 01 00 00 00       	mov    $0x1,%esi
   a:	ec                   	in     (%dx),%al
   b:	b2 05                	mov    $0x5,%dl
   d:	41 88 c0             	mov    %al,%r8b
  10:	ec                   	in     (%dx),%al
  11:	b2 06                	mov    $0x6,%dl
  13:	40 88 c7             	mov    %al,%dil
  16:	ec                   	in     (%dx),%al
  17:	b2 07                	mov    $0x7,%dl
  19:	88 c1                	mov    %al,%cl
  1b:	ec                   	in     (%dx),%al
  1c:	b2 08                	mov    $0x8,%dl
  1e:	41 88 c1             	mov    %al,%r9b
  21:	c0 e9 04             	shr    $0x4,%cl
  24:	ec                   	in     (%dx),%al
  25:	c0 e8 04             	shr    $0x4,%al
  28:	41 01 f9             	add    %edi,%r9d
  2b:	83 e1 01             	and    $0x1,%ecx
  2e:	21 c6                	and    %eax,%esi
  30:	42 8d 04 07          	lea    (%rdi,%r8,1),%eax
  34:	44 01 ce             	add    %r9d,%esi
  37:	01 c8                	add    %ecx,%eax
  39:	40 0f be f6          	movsbl %sil,%esi
  3d:	0f be c0             	movsbl %al,%eax
  40:	29 f0                	sub    %esi,%eax
  42:	c3                   	retq   


but when I uncomment "//r += p[0]? 1: 0; " in foo(), the code becomes 
unexpectedly large and unclear:

0000000000000000 <_ZNK12_GLOBAL__N_13BitcvbEv>:
   0:	48 8b 07             	mov    (%rdi),%rax
   3:	8b 4f 08             	mov    0x8(%rdi),%ecx
   6:	be 01 00 00 00       	mov    $0x1,%esi
   b:	8b 10                	mov    (%rax),%edx
   d:	d3 e6                	shl    %cl,%esi
   f:	ec                   	in     (%dx),%al
  10:	0f be c0             	movsbl %al,%eax
  13:	85 f0                	test   %esi,%eax
  15:	0f 9f c0             	setg   %al
  18:	c3                   	retq   

0000000000000019 <_Z3barv>:
  19:	53                   	push   %rbx
  1a:	e8 00 00 00 00       	callq  1f <_Z3barv+0x6>
  1f:	88 c3                	mov    %al,%bl
  21:	e8 00 00 00 00       	callq  26 <_Z3barv+0xd>
  26:	0f be d3             	movsbl %bl,%edx
  29:	0f be c0             	movsbl %al,%eax
  2c:	29 c2                	sub    %eax,%edx
  2e:	88 d0                	mov    %dl,%al
  30:	5b                   	pop    %rbx
  31:	c3                   	retq   

Disassembly of section .text._Z3fooILi4EEcv:

0000000000000000 <_Z3fooILi4EEcv>:
   0:	41 54                	push   %r12
   2:	ba 03 00 00 00       	mov    $0x3,%edx
   7:	ec                   	in     (%dx),%al
   8:	55                   	push   %rbp
   9:	b2 05                	mov    $0x5,%dl
   b:	40 88 c5             	mov    %al,%bpl
   e:	ec                   	in     (%dx),%al
   f:	53                   	push   %rbx
  10:	41 88 c4             	mov    %al,%r12b
  13:	41 8d 1c 2c          	lea    (%r12,%rbp,1),%ebx
  17:	48 83 ec 20          	sub    $0x20,%rsp
  1b:	48 8d 04 24          	lea    (%rsp),%rax
  1f:	48 8d 7c 24 10       	lea    0x10(%rsp),%rdi
  24:	c7 04 24 04 00 00 00 	movl   $0x4,(%rsp)
  2b:	c7 44 24 04 03 00 00 	movl   $0x3,0x4(%rsp)
  32:	00 
  33:	c7 44 24 08 06 00 00 	movl   $0x6,0x8(%rsp)
  3a:	00 
  3b:	0f be db             	movsbl %bl,%ebx
  3e:	c7 44 24 0c 05 00 00 	movl   $0x5,0xc(%rsp)
  45:	00 
  46:	48 89 44 24 10       	mov    %rax,0x10(%rsp)
  4b:	c7 44 24 18 00 00 00 	movl   $0x0,0x18(%rsp)
  52:	00 
  53:	e8 00 00 00 00       	callq  58 <_Z3fooILi4EEcv+0x58>
  58:	0f b6 c0             	movzbl %al,%eax
  5b:	48 8d 7c 24 10       	lea    0x10(%rsp),%rdi
  60:	c7 44 24 18 04 00 00 	movl   $0x4,0x18(%rsp)
  67:	00 
  68:	01 c3                	add    %eax,%ebx
  6a:	48 8d 44 24 08       	lea    0x8(%rsp),%rax
  6f:	0f be db             	movsbl %bl,%ebx
  72:	48 89 44 24 10       	mov    %rax,0x10(%rsp)
  77:	e8 00 00 00 00       	callq  7c <_Z3fooILi4EEcv+0x7c>
  7c:	48 83 c4 20          	add    $0x20,%rsp
  80:	0f b6 c0             	movzbl %al,%eax
  83:	01 d8                	add    %ebx,%eax
  85:	5b                   	pop    %rbx
  86:	5d                   	pop    %rbp
  87:	41 5c                	pop    %r12
  89:	c3                   	retq   

Disassembly of section .text._Z3fooILi6EEcv:

0000000000000000 <_Z3fooILi6EEcv>:
   0:	41 54                	push   %r12
   2:	ba 05 00 00 00       	mov    $0x5,%edx
   7:	ec                   	in     (%dx),%al
   8:	55                   	push   %rbp
   9:	b2 07                	mov    $0x7,%dl
   b:	40 88 c5             	mov    %al,%bpl
   e:	ec                   	in     (%dx),%al
   f:	53                   	push   %rbx
  10:	41 88 c4             	mov    %al,%r12b
  13:	41 8d 1c 2c          	lea    (%r12,%rbp,1),%ebx
  17:	48 83 ec 20          	sub    $0x20,%rsp
  1b:	48 8d 04 24          	lea    (%rsp),%rax
  1f:	48 8d 7c 24 10       	lea    0x10(%rsp),%rdi
  24:	c7 04 24 06 00 00 00 	movl   $0x6,(%rsp)
  2b:	c7 44 24 04 05 00 00 	movl   $0x5,0x4(%rsp)
  32:	00 
  33:	c7 44 24 08 08 00 00 	movl   $0x8,0x8(%rsp)
  3a:	00 
  3b:	0f be db             	movsbl %bl,%ebx
  3e:	c7 44 24 0c 07 00 00 	movl   $0x7,0xc(%rsp)
  45:	00 
  46:	48 89 44 24 10       	mov    %rax,0x10(%rsp)
  4b:	c7 44 24 18 00 00 00 	movl   $0x0,0x18(%rsp)
  52:	00 
  53:	e8 00 00 00 00       	callq  58 <_Z3fooILi6EEcv+0x58>
  58:	0f b6 c0             	movzbl %al,%eax
  5b:	48 8d 7c 24 10       	lea    0x10(%rsp),%rdi
  60:	c7 44 24 18 04 00 00 	movl   $0x4,0x18(%rsp)
  67:	00 
  68:	01 c3                	add    %eax,%ebx
  6a:	48 8d 44 24 08       	lea    0x8(%rsp),%rax
  6f:	0f be db             	movsbl %bl,%ebx
  72:	48 89 44 24 10       	mov    %rax,0x10(%rsp)
  77:	e8 00 00 00 00       	callq  7c <_Z3fooILi6EEcv+0x7c>
  7c:	48 83 c4 20          	add    $0x20,%rsp
  80:	0f b6 c0             	movzbl %al,%eax
  83:	01 d8                	add    %ebx,%eax
  85:	5b                   	pop    %rbx
  86:	5d                   	pop    %rbp
  87:	41 5c                	pop    %r12
  89:	c3                   	retq   


compilation flags:
g++ -Os test.cpp -c -o test.o -std=c++11


This may seem to be a less important problem for x86 archs, but I'm affected 
by this problem on the avr arch, where memory is very limited. Can I somehow 
figure out why gcc gives up on generating clean code in the second example?


regards

-- 
Michał Walenciak
gmail.com kicer86
http://kicer.sileman.net.pl
gg: 3729519

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: problems with optimisation
  2012-12-28 10:25 problems with optimisation Kicer
@ 2012-12-28 15:19 ` Andrew Haley
  2012-12-28 16:34   ` David Brown
  0 siblings, 1 reply; 9+ messages in thread
From: Andrew Haley @ 2012-12-28 15:19 UTC (permalink / raw)
  To: Kicer; +Cc: gcc-help

On 12/28/2012 10:25 AM, Kicer wrote:
> Hi all
> 
> 
> Last days I've found a problem with some certain code optimisations:
> 
> 
> namespace 
> {
>   
>   struct Base;
>   
>   struct Bit
>   {
> 	  const Base &m_p;
> 	  const int m_pos;
> 	  
> 	  constexpr Bit(const Base &p, const int pos): m_p(p), m_pos(pos)
> 	  {
> 	  }
> 	
> 	  operator bool() const;  
>   };
>   
>   struct Base
>   {  
> 	  const int m_port;
> 	  constexpr Base(int p): m_port(p)
> 	  {
> 	  }
> 	  
> 	  operator char () const
> 	  {
> 		  char result;
> 		    
> 		  asm(
> 			"in %%dx, %%al\n"
> 			:"=a"(result)
> 			:"d"(m_port)
> 		  );
> 		  
> 		  //result = *(reinterpret_cast<char *>(m_port+32));
> 		  
> 		  return result;
> 	  }
> 	  
> 	  Bit operator[] (int p) const
> 	  {
> 		  Bit r(*this, p);
> 		  return r;
> 	  }
> 	
>   };
> 
> 
>   Bit::operator bool() const
>   {
> 	  const char v = m_p;
> 	  const bool r = (v & (1 << m_pos)) > 0;
> 	  
> 	  return r;
>   }
>   
>   struct Anc: public Base
>   {
> 	  const Base m_in;
> 	  constexpr Anc(int o): Base(o), m_in(o - 1)
> 	  {
> 	  }
> 	  
> 	  const Base& getIn() const
> 	  {
> 		  return m_in;
> 	  }
> 	
>   };
> 
> }
> 
> template<int v>
> char foo()
> {
> 	Anc p(v), p2(v+2);
> 	char r = p.getIn() + p2.getIn();
> 	
> 	//r += p[0]? 1: 0;                   //commented out at first step
> 	r += p2[4]? 1 : 0;
> 	
> 	return r;
> }
> 
> 
> char bar()
> {
>   char r = foo<4>();
>   
>   r-= foo<6>();
>   
>   return r;
> }
> 
> there are 3 structs which looks more complex than the code they generate.
> foo() and bar() are just ising those structs.
> For the code above output is short and clear as expected: 
> 
> but when I uncomment "//r += p[0]? 1: 0; " in foo(), the code becomes 
> unexpectly large and unclear:
> 

> 
> compilation flags:
> g++ -Os test.cpp -c -o test.o -std=c++11
> 
> 
> this may seem to be a less important problem for x86 archs, but I'm affected 
> with this problem on avr arch where memory is very limited. Can I somehow 
> figure out why gcc resigns from generation clean code in second example?

With -O2 there's much less difference:

bar():								bar():
.LFB14:								.LFB14:
	.cfi_startproc							.cfi_startproc
	movl	$3, %edx						movl	$3, %edx
	in %dx, %al							in %dx, %al

	movb	$6, %dl					      |		movb	$4, %dl
	movl	%eax, %ecx						movl	%eax, %ecx
	in %dx, %al							in %dx, %al

							      >		movb	$6, %dl
							      >		movl	%eax, %edi
							      >		in %dx, %al
							      >
	movb	$7, %dl							movb	$7, %dl
	movl	%eax, %esi						movl	%eax, %esi
							      >		andl	$1, %edi
	in %dx, %al							in %dx, %al

	movl	%eax, %edi				      |		movl	%eax, %r8d
							      >		movsbl	%sil, %esi
	movb	$8, %dl							movb	$8, %dl
	subb	%dil, %cl				      |		subb	%r8b, %cl
	in %dx, %al							in %dx, %al

	andl	$16, %esi				      |		addl	%edi, %ecx
							      >		testb	$16, %sil
	setne	%dl							setne	%dl
							      >		andl	$1, %esi
	addl	%edx, %ecx						addl	%edx, %ecx
							      >		subb	%sil, %cl
	testb	$16, %al						testb	$16, %al
	setne	%al							setne	%al
	subb	%al, %cl						subb	%al, %cl
	movl	%ecx, %eax						movl	%ecx, %eax
	ret								ret


Without inlining GCC can't tell what your program is doing, and by using
-Os you're preventing GCC from inlining.

Andrew.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: problems with optimisation
  2012-12-28 15:19 ` Andrew Haley
@ 2012-12-28 16:34   ` David Brown
  2012-12-28 17:14     ` Kicer
  0 siblings, 1 reply; 9+ messages in thread
From: David Brown @ 2012-12-28 16:34 UTC (permalink / raw)
  To: Andrew Haley; +Cc: Kicer, gcc-help

On 28/12/12 16:19, Andrew Haley wrote:
> On 12/28/2012 10:25 AM, Kicer wrote:
>> Hi all
>>
>>
>> Last days I've found a problem with some certain code optimisations:
>>
>>
>> namespace 
>> {
>>   
>>   struct Base;
>>   
>>   struct Bit
>>   {
>> 	  const Base &m_p;
>> 	  const int m_pos;
>> 	  
>> 	  constexpr Bit(const Base &p, const int pos): m_p(p), m_pos(pos)
>> 	  {
>> 	  }
>> 	
>> 	  operator bool() const;  
>>   };
>>   
>>   struct Base
>>   {  
>> 	  const int m_port;
>> 	  constexpr Base(int p): m_port(p)
>> 	  {
>> 	  }
>> 	  
>> 	  operator char () const
>> 	  {
>> 		  char result;
>> 		    
>> 		  asm(
>> 			"in %%dx, %%al\n"
>> 			:"=a"(result)
>> 			:"d"(m_port)
>> 		  );
>> 		  
>> 		  //result = *(reinterpret_cast<char *>(m_port+32));
>> 		  
>> 		  return result;
>> 	  }
>> 	  
>> 	  Bit operator[] (int p) const
>> 	  {
>> 		  Bit r(*this, p);
>> 		  return r;
>> 	  }
>> 	
>>   };
>>
>>
>>   Bit::operator bool() const
>>   {
>> 	  const char v = m_p;
>> 	  const bool r = (v & (1 << m_pos)) > 0;
>> 	  
>> 	  return r;
>>   }
>>   
>>   struct Anc: public Base
>>   {
>> 	  const Base m_in;
>> 	  constexpr Anc(int o): Base(o), m_in(o - 1)
>> 	  {
>> 	  }
>> 	  
>> 	  const Base& getIn() const
>> 	  {
>> 		  return m_in;
>> 	  }
>> 	
>>   };
>>
>> }
>>
>> template<int v>
>> char foo()
>> {
>> 	Anc p(v), p2(v+2);
>> 	char r = p.getIn() + p2.getIn();
>> 	
>> 	//r += p[0]? 1: 0;                   //commented out at first step
>> 	r += p2[4]? 1 : 0;
>> 	
>> 	return r;
>> }
>>
>>
>> char bar()
>> {
>>   char r = foo<4>();
>>   
>>   r-= foo<6>();
>>   
>>   return r;
>> }
>>
>> there are 3 structs which looks more complex than the code they generate.
>> foo() and bar() are just ising those structs.
>> For the code above output is short and clear as expected: 
>>
>> but when I uncomment "//r += p[0]? 1: 0; " in foo(), the code becomes 
>> unexpectly large and unclear:
>>
> 
>>
>> compilation flags:
>> g++ -Os test.cpp -c -o test.o -std=c++11
>>
>>
>> this may seem to be a less important problem for x86 archs, but I'm affected 
>> with this problem on avr arch where memory is very limited. Can I somehow 
>> figure out why gcc resigns from generation clean code in second example?
> 
> With -O2 there's much less difference:
> 
> bar():								bar():
> .LFB14:								.LFB14:
> 	.cfi_startproc							.cfi_startproc
> 	movl	$3, %edx						movl	$3, %edx
> 	in %dx, %al							in %dx, %al
> 
> 	movb	$6, %dl					      |		movb	$4, %dl
> 	movl	%eax, %ecx						movl	%eax, %ecx
> 	in %dx, %al							in %dx, %al
> 
> 							      >		movb	$6, %dl
> 							      >		movl	%eax, %edi
> 							      >		in %dx, %al
> 							      >
> 	movb	$7, %dl							movb	$7, %dl
> 	movl	%eax, %esi						movl	%eax, %esi
> 							      >		andl	$1, %edi
> 	in %dx, %al							in %dx, %al
> 
> 	movl	%eax, %edi				      |		movl	%eax, %r8d
> 							      >		movsbl	%sil, %esi
> 	movb	$8, %dl							movb	$8, %dl
> 	subb	%dil, %cl				      |		subb	%r8b, %cl
> 	in %dx, %al							in %dx, %al
> 
> 	andl	$16, %esi				      |		addl	%edi, %ecx
> 							      >		testb	$16, %sil
> 	setne	%dl							setne	%dl
> 							      >		andl	$1, %esi
> 	addl	%edx, %ecx						addl	%edx, %ecx
> 							      >		subb	%sil, %cl
> 	testb	$16, %al						testb	$16, %al
> 	setne	%al							setne	%al
> 	subb	%al, %cl						subb	%al, %cl
> 	movl	%ecx, %eax						movl	%ecx, %eax
> 	ret								ret
> 
> 
> Without inlining GCC can't tell what your program is doing, and by using
> -Os you're preventing GCC from inlining.
> 
> Andrew.
> 

There are normally good reasons for picking -Os rather than -O2 for
small microcontrollers (the OP is targeting AVRs, which typically have
quite small program flash memories).

So the solution here is to manually declare the various functions as
"inline" (or at least "static", so that the compiler will inline them
automatically).  Very often, code that manipulates bits is horrible on a
target like the AVR if the function is not inline, and the compiler has
the bit number(s) as variables - but with inline code generation and
constant folding, you end up with only an instruction or two for
compile-time constant bit numbers.

(To the OP) - also note that there can be significant differences in the
types of code generation and optimisations for different backends.  I
assume you posted x86 assembly because you thought it would be more
familiar to people on this list, but I think it would be more important
to show the real assembly from the target you are using as you might see
different optimisations or missed optimisations.

Finally, there is a mailing list dedicated to gcc on the avr - it might
be worth posting there too, especially if you think the issue is
avr-specific.

David


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: problems with optimisation
  2012-12-28 16:34   ` David Brown
@ 2012-12-28 17:14     ` Kicer
  2012-12-29 16:26       ` David Brown
  0 siblings, 1 reply; 9+ messages in thread
From: Kicer @ 2012-12-28 17:14 UTC (permalink / raw)
  To: David Brown; +Cc: Andrew Haley, gcc-help

Dnia piątek 28 grudnia 2012 17:33:59 David Brown pisze:
> On 28/12/12 16:19, Andrew Haley wrote:
> > With -O2 there's much less difference:
> > 
> > bar():								bar():
> > 
> > .LFB14:								.LFB14:
> > 	.cfi_startproc							.cfi_startproc
> > 	movl	$3, %edx						movl	$3, %edx
> > 	in %dx, %al							in %dx, %al
> > 	
> > 	movb	$6, %dl					      |		movb	$4, %dl
> > 	movl	%eax, %ecx						movl	%eax, %ecx
> > 	in %dx, %al							in %dx, %al
> > 	
> > 							      >		movb	$6, %dl
> > 							      >		movl	%eax, %edi
> > 							      >		in %dx, %al
> > 	
> > 	movb	$7, %dl							movb	$7, %dl
> > 	movl	%eax, %esi						movl	%eax, %esi
> > 	
> > 							      >		andl	$1, %edi
> > 	
> > 	in %dx, %al							in %dx, %al
> > 	
> > 	movl	%eax, %edi				      |		movl	%eax, %r8d
> > 	
> > 							      >		movsbl	%sil, %esi
> > 	
> > 	movb	$8, %dl							movb	$8, %dl
> > 	subb	%dil, %cl				      |		subb	%r8b, %cl
> > 	in %dx, %al							in %dx, %al
> > 	
> > 	andl	$16, %esi				      |		addl	%edi, %ecx
> > 	
> > 							      >		testb	$16, %sil
> > 	
> > 	setne	%dl							setne	%dl
> > 	
> > 							      >		andl	$1, %esi
> > 	
> > 	addl	%edx, %ecx						addl	%edx, %ecx
> > 	
> > 							      >		subb	%sil, %cl
> > 	
> > 	testb	$16, %al						testb	$16, %al
> > 	setne	%al							setne	%al
> > 	subb	%al, %cl						subb	%al, %cl
> > 	movl	%ecx, %eax						movl	%ecx, %eax
> > 	ret								ret
> > 
> > Without inlining GCC can't tell what your program is doing, and by using
> > -Os you're preventing GCC from inlining.
> > 
> > Andrew.
> 
> There are normally good reasons for picking -Os rather than -O2 for
> small microcontrollers (the OP is targeting AVRs, which typically have
> quite small program flash memories).
> 
> So the solution here is to manually declare the various functions as
> "inline" (or at least "static", so that the compiler will inline them
> automatically).  Very often, code that manipulates bits is horrible on a
> target like the AVR if the function is not inline, and the compiler has
> the bit number(s) as variables - but with inline code generation and
> constant folding, you end up with only an instruction or two for
> compile-time constant bit numbers.
> 
> (To the OP) - also note that there can be significant differences in the
> types of code generation and optimisations for different backends.  I
> assume you posted x86 assembly because you thought it would be more
> familiar to people on this list, but I think it would be more important
> to show the real assembly from the target you are using as you might see
> different optimisations or missed optimisations.
> 
> Finally, there is a mailing list dedicated to gcc on the avr - it might
> be worth posting there too, especially if you think the issue is
> avr-specific.
> 
> David

David: you are right - I used x86 due to its popularity ;)

In my real case I'm observing weird things (speaking of inline): 

1. when in my code I use -Os and inline functions - gcc doesn't inline the code 
(and AFAIR, generates a warning that it won't inline because the code would 
grow).
The code looks funny then:

00000044 
<_ZNK7OneWire14InterruptBasedILt56ELh4EE10releaseBusEv.isra.0.1569.1517>:
  44:	bc 98       	cbi	0x17, 4	; 23
  46:	08 95       	ret


plus a few calls like:
rcall	.-262    	; 0x44 
<_ZNK7OneWire14InterruptBasedILt56ELh4EE10releaseBusEv.isra.0.1569.1517>


those calls are completely useless as 'cbi' could be placed instead of them, 
and the whole function actually consists of 1 instruction (except ret).
This is quite important for me as I lose a certain number of clock ticks here 
:)

2. when I use -Os and the always_inline attribute, I get messy code like in my 
first message (the program gets bigger by 70%, and uses 2-3x more stack, which 
is half of the available memory).


It's hard to post the whole avr program here as it's big, and it's difficult to 
produce a smaller example, because it only gets messy when the program gets 
bigger.

Andrew: it's inconvenient to use O2, as Os produces a program whose size is 30% 
of O2's result.

regards

-- 
Michał Walenciak
gmail.com kicer86
http://kicer.sileman.net.pl
gg: 3729519

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: problems with optimisation
  2012-12-28 17:14     ` Kicer
@ 2012-12-29 16:26       ` David Brown
  2012-12-29 22:28         ` Ángel González
  2012-12-30  9:47         ` Kicer
  0 siblings, 2 replies; 9+ messages in thread
From: David Brown @ 2012-12-29 16:26 UTC (permalink / raw)
  To: Kicer; +Cc: Andrew Haley, gcc-help

On 28/12/12 18:14, Kicer wrote:
> Dnia piątek 28 grudnia 2012 17:33:59 David Brown pisze:
>> On 28/12/12 16:19, Andrew Haley wrote:
>>> With -O2 there's much less difference:
>>>
>>> bar():								bar():
>>>
>>> .LFB14:								.LFB14:
>>> 	.cfi_startproc							.cfi_startproc
>>> 	movl	$3, %edx						movl	$3, %edx
>>> 	in %dx, %al							in %dx, %al
>>> 	
>>> 	movb	$6, %dl					      |		movb	$4, %dl
>>> 	movl	%eax, %ecx						movl	%eax, %ecx
>>> 	in %dx, %al							in %dx, %al
>>> 	
>>> 							      >		movb	$6, %dl
>>> 							      >		movl	%eax, %edi
>>> 							      >		in %dx, %al
>>> 	
>>> 	movb	$7, %dl							movb	$7, %dl
>>> 	movl	%eax, %esi						movl	%eax, %esi
>>> 	
>>> 							      >		andl	$1, %edi
>>> 	
>>> 	in %dx, %al							in %dx, %al
>>> 	
>>> 	movl	%eax, %edi				      |		movl	%eax, %r8d
>>> 	
>>> 							      >		movsbl	%sil, %esi
>>> 	
>>> 	movb	$8, %dl							movb	$8, %dl
>>> 	subb	%dil, %cl				      |		subb	%r8b, %cl
>>> 	in %dx, %al							in %dx, %al
>>> 	
>>> 	andl	$16, %esi				      |		addl	%edi, %ecx
>>> 	
>>> 							      >		testb	$16, %sil
>>> 	
>>> 	setne	%dl							setne	%dl
>>> 	
>>> 							      >		andl	$1, %esi
>>> 	
>>> 	addl	%edx, %ecx						addl	%edx, %ecx
>>> 	
>>> 							      >		subb	%sil, %cl
>>> 	
>>> 	testb	$16, %al						testb	$16, %al
>>> 	setne	%al							setne	%al
>>> 	subb	%al, %cl						subb	%al, %cl
>>> 	movl	%ecx, %eax						movl	%ecx, %eax
>>> 	ret								ret
>>>
>>> Without inlining GCC can't tell what your program is doing, and by using
>>> -Os you're preventing GCC from inlining.
>>>
>>> Andrew.
>>
>> There are normally good reasons for picking -Os rather than -O2 for
>> small microcontrollers (the OP is targeting AVRs, which typically have
>> quite small program flash memories).
>>
>> So the solution here is to manually declare the various functions as
>> "inline" (or at least "static", so that the compiler will inline them
>> automatically).  Very often, code that manipulates bits is horrible on a
>> target like the AVR if the function is not inline, and the compiler has
>> the bit number(s) as variables - but with inline code generation and
>> constant folding, you end up with only an instruction or two for
>> compile-time constant bit numbers.
>>
>> (To the OP) - also note that there can be significant differences in the
>> types of code generation and optimisations for different backends.  I
>> assume you posted x86 assembly because you thought it would be more
>> familiar to people on this list, but I think it would be more important
>> to show the real assembly from the target you are using as you might see
>> different optimisations or missed optimisations.
>>
>> Finally, there is a mailing list dedicated to gcc on the avr - it might
>> be worth posting there too, especially if you think the issue is
>> avr-specific.
>>
>> David
>
> David: you are right - I used x86 due to its popularity ;)
>
> In my real case I'm observing weird thigs (speaking of inline):
>
> 1. when in my code I use -Os and inline functions - gcc doesn't inline code
> (and AFAIR, generates warning about it wont't inline because code would
> grown).
> Code looks funny then:
>
> 00000044
> <_ZNK7OneWire14InterruptBasedILt56ELh4EE10releaseBusEv.isra.0.1569.1517>:
>    44:	bc 98       	cbi	0x17, 4	; 23
>    46:	08 95       	ret
>
>
> plus a few calls like:
> rcall	.-262    	; 0x44
> <_ZNK7OneWire14InterruptBasedILt56ELh4EE10releaseBusEv.isra.0.1569.1517>
>
>
> those calls are completly useless as 'cbi' could be placed instead of them,
> and the whole function actually consists of 1 command (except ret).
> This is quite important for me as I loose certain amount of clock ticks here
> :)
>
> 2. when I use -Os and always_inline attribute, I get a messy code like in my
> first message (program gets bigger by 70%, and uses 2-3x more stack which is
> half of available memory).
>
>
> It's hard to place whole avr program here as it's big, and it's difficult to
> introduce a smaller exmaple, because it's getting messy only when program gets
> bigger.
>
> Andrew: it's inconvenient to use O2 as Os produces a progam which size is 30%
> of O2's result.
>
> regards
>

With -Os, the compiler will obey normal "inline" directives (at least, 
that is my experience when compiling C on the avr - I have not tried C++ 
much on it).  It won't do any automatic extra inlining, except for 
static functions that are only used once - which are always inlined as 
this saves space.  Again, I don't know how that plays with template 
functions or other C++ features.

As far as I know, gcc uses weighting heuristics to decide whether to do 
something like the rcall you mentioned above, compared to using the inlined 
code directly.  It is certainly not impossible that the weightings are 
not optimal here.

There is currently very little use of C++ with avr-gcc.  The avr port 
maintainers and the avrlibc developers have little experience with C++, 
and feel they have enough to do with just the C support.  But there are 
a few people on the avr-gcc mailing list that work with C++, and it is 
certainly worth posting there too - they may be able to give suggestions.

<https://lists.nongnu.org/mailman/listinfo/avr-gcc-list>

mvh.,

David

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: problems with optimisation
  2012-12-29 16:26       ` David Brown
@ 2012-12-29 22:28         ` Ángel González
  2012-12-30  9:51           ` Kicer
  2012-12-30 11:25           ` David Brown
  2012-12-30  9:47         ` Kicer
  1 sibling, 2 replies; 9+ messages in thread
From: Ángel González @ 2012-12-29 22:28 UTC (permalink / raw)
  To: David Brown; +Cc: Kicer, Andrew Haley, gcc-help

On 29/12/12 17:26, David Brown wrote:
> With -Os, the compiler will obey normal "inline" directives (at least,
> that is my experience when compiling C on the avr - I have not tried
> C++ much on it).  It won't do any automatic extra inlining, except for
> static functions that are only used once - which are always inlined as
> this saves space.  Again, I don't know how that plays with template
> functions or other C++ features.
>
> As far as I know, gcc uses weighting heuristics to decide whether to
> do something the rcall you mentioned above, compared to using the
> inlined code directly.  It is certainly not impossible that the
> weightings are not optimal here.
>
> There is currently very little use of C++ with avr-gcc.  The avr port
> maintainers and the avrlibc developers have little experience with
> C++, and feel they have enough to do with just the C support.  But
> there are a few people on the avr-gcc mailing list that work with C++,
> and it is certainly worth posting there too - they may be able to give
> suggestions.
>
> <https://lists.nongnu.org/mailman/listinfo/avr-gcc-list>
>
> mvh.,
>
> David

I got good results (code apparently better) using -O3 in avr instead of
-Os. Just the skipped instructions in the prologues and epilogues may be
worth it. It may be that since on avr you have one cycle per instruction
(except branches), when optimizing for speed, you indirectly also
optimize the number of instructions. However, I was using C, not C++, so
the different way of coding could lead to worse optimizations.
I recommend giving gcc as much information as possible, and watch the
generated code. I got gcc to perform a few tricky optimizations, and in
one case, I manually unrolled a loop for him (otherwise, it didn't
notice it could be optimized). If you see a very bad instance of code
generation, open a bug. :)
What difference do you have from -Os to -O3 ?

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: problems with optimisation
  2012-12-29 16:26       ` David Brown
  2012-12-29 22:28         ` Ángel González
@ 2012-12-30  9:47         ` Kicer
  1 sibling, 0 replies; 9+ messages in thread
From: Kicer @ 2012-12-30  9:47 UTC (permalink / raw)
  To: David Brown; +Cc: Andrew Haley, gcc-help

David Brown wrote:
> 
> With -Os, the compiler will obey normal "inline" directives (at least,
> that is my experience when compiling C on the avr - I have not tried C++
> much on it).  It won't do any automatic extra inlining, except for
> static functions that are only used once - which are always inlined as
> this saves space.  Again, I don't know how that plays with template
> functions or other C++ features.
> 
> As far as I know, gcc uses weighting heuristics to decide whether to do
> something the rcall you mentioned above, compared to using the inlined
> code directly.  It is certainly not impossible that the weightings are
> not optimal here.
> 
> There is currently very little use of C++ with avr-gcc.  The avr port
> maintainers and the avrlibc developers have little experience with C++,
> and feel they have enough to do with just the C support.  But there are
> a few people on the avr-gcc mailing list that work with C++, and it is
> certainly worth posting there too - they may be able to give suggestions.
> 
> <https://lists.nongnu.org/mailman/listinfo/avr-gcc-list>
> 
> mvh.,
> 
> David

I see, thx, I'll try there

-- 
Michał Walenciak
gmail.com kicer86
http://kicer.sileman.net.pl
gg: 3729519

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: problems with optimisation
  2012-12-29 22:28         ` Ángel González
@ 2012-12-30  9:51           ` Kicer
  2012-12-30 11:25           ` David Brown
  1 sibling, 0 replies; 9+ messages in thread
From: Kicer @ 2012-12-30  9:51 UTC (permalink / raw)
  To: Ángel González; +Cc: David Brown, Andrew Haley, gcc-help

Ángel González wrote:
> 
> I got good results (code apparently better) using -O3 in avr instead of
> -Os. Just the skipped instructions in the prologue and epiloques may be
> worth it. It may that since on avr you have one cycle per instruction
> (except branches), when optimizing for speed, you indirectly also
> optimize the number of instructions. However, I was using C, not C++, so
> the different way of coding could lead to worse optimizations.
> I recommend giving gcc as much information as possible, and watch the
> generated code. I got gcc to perform a few tricky optimizations, and in
> one case, I manually unrolled a loop for him (otherwise, it didn't
> notice it could be optimized). If you see a very bad instance of code
> generation, open a bug. :)
> What difference do you have from -Os to -O3 ?

Very often I also prefer O3 over Os, but this time Os seems to generate better 
results.
That's probably because I learnt how to program on avr using c++ (templates 
rule ;) ) and my program is almost always perfectly optimized. Only sometimes 
I get some weird behaviour when the code becomes bigger.

regards

-- 
Michał Walenciak
gmail.com kicer86
http://kicer.sileman.net.pl
gg: 3729519

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: problems with optimisation
  2012-12-29 22:28         ` Ángel González
  2012-12-30  9:51           ` Kicer
@ 2012-12-30 11:25           ` David Brown
  1 sibling, 0 replies; 9+ messages in thread
From: David Brown @ 2012-12-30 11:25 UTC (permalink / raw)
  To: Ángel González; +Cc: Kicer, Andrew Haley, gcc-help

On 29/12/12 23:26, Ángel González wrote:
> On 29/12/12 17:26, David Brown wrote:
>> With -Os, the compiler will obey normal "inline" directives (at least,
>> that is my experience when compiling C on the avr - I have not tried
>> C++ much on it).  It won't do any automatic extra inlining, except for
>> static functions that are only used once - which are always inlined as
>> this saves space.  Again, I don't know how that plays with template
>> functions or other C++ features.
>>
>> As far as I know, gcc uses weighting heuristics to decide whether to
>> do something like the rcall you mentioned above, compared to using the
>> inlined code directly.  It is certainly not impossible that the
>> weightings are not optimal here.
>>
>> There is currently very little use of C++ with avr-gcc.  The avr port
>> maintainers and the avrlibc developers have little experience with
>> C++, and feel they have enough to do with just the C support.  But
>> there are a few people on the avr-gcc mailing list that work with C++,
>> and it is certainly worth posting there too - they may be able to give
>> suggestions.
>>
>> <https://lists.nongnu.org/mailman/listinfo/avr-gcc-list>
>>
>> mvh.,
>>
>> David
>
> I got good results (code apparently better) using -O3 in avr instead of
> -Os. Just the skipped instructions in the prologue and epilogues may be
> worth it. It may be that since on avr you have one cycle per instruction
> (except branches), when optimizing for speed, you indirectly also
> optimize the number of instructions. However, I was using C, not C++, so
> the different way of coding could lead to worse optimizations.

It is not always easy to guess the best choice of optimisation flags. 
You are right that on the avr, small often means fast - and 
optimisations that first appear to make code larger (such as inlining 
functions that are used more than once, or loop unrolling) can lead to 
smaller code by avoiding prologues/epilogues, function call overheads, 
and other "bookkeeping" code.  Theoretically, the compiler knows this 
and will pick the smaller code with -Os.  In practice, it is a very hard 
problem, and there is a limit to the complexity (and accuracy) that can 
be achieved here.

On the bright side, gcc seems to be getting steadily smarter about these 
things - gcc 4.7 does partial function inlining and function 
specialisation in some cases.

Personally, I would like to see the distinction between "optimise for 
speed" and "optimise for size" disappear - or at least be reduced to a 
specialised flag (meaning "I /really/ don't care about speed - just make 
the code as small as possible", and vice-versa).  There are several 
reasons for this:

On modern "big" cpus, small means fast because small fits the fastest 
cache levels (including branch target buffers, prefetch buffers, etc.) 
best.  On an old 386 cpu it might make sense to unroll a loop - on an i7 
the fastest code will have the loop intact (unless unrolling gives 
additional optimisations).  And now the 386 will be deprecated...

On small cpus (like the avr), fast means small because fast means 
running fewer instructions.

In cases where it makes sense to bias on the side of size or speed, 
programmers are notoriously bad at making such decisions themselves. 
Hands up all developers who always profile their code before deciding 
which bits need optimisations :-)  The compiler, on the other hand, can 
do a reasonable job in many cases (see the -fipa-profile flag for an 
example).


On big cpus, the normal optimisation choice should be "make this code as 
fast as possible on this processor, maintaining all standards".  Other 
sensible options are "as fast as possible disregarding the fine print of 
IEEE standards" (the "-Ofast" flag), and "as fast as possible but still 
easy to debug" (the "-Og" flag).

On small cpus, the ideal flag would be something like "as fast as 
possible, but fitting within 32K code memory" - but I don't see that 
coming in the next version or two of gcc...



> I recommend giving gcc as much information as possible, and watch the
> generated code. I got gcc to perform a few tricky optimizations, and in
> one case, I manually unrolled a loop for him (otherwise, it didn't
> notice it could be optimized). If you see a very bad instance of code
> generation, open a bug. :)
> What difference do you have from -Os to -O3 ?
>

The more information the compiler gets, the better.  In particular, you 
always get better results if you can make your functions (and data) 
static - if the compiler can see that the functions don't escape (by 
taking their addresses), it can do far more optimisations.

mvh.,

David

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2012-12-30 11:25 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-12-28 10:25 problems with optimisation Kicer
2012-12-28 15:19 ` Andrew Haley
2012-12-28 16:34   ` David Brown
2012-12-28 17:14     ` Kicer
2012-12-29 16:26       ` David Brown
2012-12-29 22:28         ` Ángel González
2012-12-30  9:51           ` Kicer
2012-12-30 11:25           ` David Brown
2012-12-30  9:47         ` Kicer

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).