Optimizing 32 bits integer manipulation on 8 bit AVR target

public inbox for gcc-help@gcc.gnu.org
 help / color / mirror / Atom feed

* Optimizing 32 bits integer manipulation on 8 bit AVR target
@ 2012-08-30 15:32 Sylvain Leroux
  2012-08-30 18:05 ` Sylvain Leroux
  2012-08-31 22:37 ` Georg-Johann Lay
  0 siblings, 2 replies; 4+ messages in thread
From: Sylvain Leroux @ 2012-08-30 15:32 UTC (permalink / raw)
  To: gcc-help

Hi,

It seems to me that avr-gcc/avr-g++ is producing sub-optimal code for 
the 'f' function in the following source code:

---------8<-----------------------------------
#include <avr/io.h>

void f(uint32_t i) {
     i |= ((uint32_t)(0xFF) << 16);

     /* DDRA is an 8 bit register */
     DDRA = (uint32_t)(i);
     DDRA = (uint32_t)(i>>8);
     DDRA = (uint32_t)(i>>16);
     DDRA = (uint32_t)(i>>24);
}

int main() {
     volatile uint32_t n = 0x01020304;

     f(n);
}
---------8<-----------------------------------
Having compiled with the following options:
avr-gcc c.c -mmcu=attiny2313
             -Os -ffunction-sections -fdata-sections
             -g -Wl,--gc-sections -Wl,--print-gc-sections
             -fipa-cp -fcprop-registers -fweb

... here is the relevant fragment as displayed by avr-objdump. I marked 
with a star (*) all the instructions that appear to be useless:
---------8<-----------------------------------
void f(uint32_t i) {
     i |= ((uint32_t)(0xFF) << 16);
   34:   8f 6f           ori     r24, 0xFF       ; 255

     DDRA = (uint32_t)(i);
   36:   6a bb           out     0x1a, r22       ; 26
     DDRA = (uint32_t)(i>>8);
   38:   27 2f           mov     r18, r23
* 3a:   38 2f           mov     r19, r24
* 3c:   49 2f           mov     r20, r25
* 3e:   55 27           eor     r21, r21
   40:   2a bb           out     0x1a, r18       ; 26
     DDRA = (uint32_t)(i>>16);
   42:   9c 01           movw    r18, r24
* 44:   44 27           eor     r20, r20
* 46:   55 27           eor     r21, r21
   48:   2a bb           out     0x1a, r18       ; 26
     DDRA = (uint32_t)(i>>24);
   4a:   69 2f           mov     r22, r25
* 4c:   77 27           eor     r23, r23
* 4e:   88 27           eor     r24, r24
* 50:   99 27           eor     r25, r25
   52:   6a bb           out     0x1a, r22       ; 26
}
   54:   08 95           ret
---------8<-----------------------------------

Both gcc and g++ produce the same code. And I get the same results both 
with 4.3.5 and 4.7.1

Here is my question:
Are there any options that will help gcc avoid producing those extra 
instructions in such a case?


Regards,
- Sylvain



-- 
-- Sylvain Leroux
-- sylvain@chicoree.fr
-- http://www.chicoree.fr

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: Optimizing 32 bits integer manipulation on 8 bit AVR target
  2012-08-30 15:32 Optimizing 32 bits integer manipulation on 8 bit AVR target Sylvain Leroux
@ 2012-08-30 18:05 ` Sylvain Leroux
  2012-08-31 11:51   ` David Brown
  2012-08-31 22:37 ` Georg-Johann Lay
  1 sibling, 1 reply; 4+ messages in thread
From: Sylvain Leroux @ 2012-08-30 18:05 UTC (permalink / raw)
  To: gcc-help

As a complement to my previous message,

It appears the following C source leads to much better code:

---------8<-----------------------------------
void f(uint32_t i) {
     union __attribute__((__packed__)) {
	uint32_t i;
	struct S { uint8_t a,b,c,d; } s;
     } u;

     u.i = i | ((uint32_t)(0xFF) << 16);

     DDRA = (uint32_t)(u.s.d);
     DDRA = (uint32_t)(u.s.c);
     DDRA = (uint32_t)(u.s.b);
     DDRA = (uint32_t)(u.s.a);
}
---------8<-----------------------------------

avr-objdump:
---------8<-----------------------------------
     union __attribute__((__packed__)) {
         uint32_t i;
         struct S { uint8_t a,b,c,d; } s;
     } u;

     u.i = i | ((uint32_t)(0xFF) << 16);
   58:   af 6f           ori     r26, 0xFF       ; 255

     DDRA = (uint32_t)(u.s.d);
   5a:   ba bb           out     0x1a, r27       ; 26
     DDRA = (uint32_t)(u.s.c);
   5c:   aa bb           out     0x1a, r26       ; 26
     DDRA = (uint32_t)(u.s.b);
   5e:   9a bb           out     0x1a, r25       ; 26
     DDRA = (uint32_t)(u.s.a);
   60:   8a bb           out     0x1a, r24       ; 26
---------8<-----------------------------------

*But* the C code is no longer portable, since I'm using 
"__attribute__((__packed__))". Moreover, it relies on an assumption 
about the target's endianness.


That's why I was hoping for a command line option allowing gcc to 
perform the same optimization.

- Sylvain


On 08/30/2012 02:54 PM, Sylvain Leroux wrote:
> Hi,
>
> It seems to me that avr-gcc/avr-g++ is producing sub-optimal code for
> the 'f' function in the following source code:
>
> ---------8<-----------------------------------
> #include <avr/io.h>
>
> void f(uint32_t i) {
> i |= ((uint32_t)(0xFF) << 16);
>
> /* DDRA is an 8 bit register */
> DDRA = (uint32_t)(i);
> DDRA = (uint32_t)(i>>8);
> DDRA = (uint32_t)(i>>16);
> DDRA = (uint32_t)(i>>24);
> }
>
> int main() {
> volatile uint32_t n = 0x01020304;
>
> f(n);
> }
> ---------8<-----------------------------------
> Having compiled with the following options:
> avr-gcc c.c -mmcu=attiny2313
> -Os -ffunction-sections -fdata-sections
> -g -Wl,--gc-sections -Wl,--print-gc-sections
> -fipa-cp -fcprop-registers -fweb
>
> ... here is the relevant fragment as displayed by avr-objdump. I marked
> with a star (*) all the instruction that appears to be useless:
> ---------8<-----------------------------------
> void f(uint32_t i) {
> i |= ((uint32_t)(0xFF) << 16);
> 34: 8f 6f ori r24, 0xFF ; 255
>
> DDRA = (uint32_t)(i);
> 36: 6a bb out 0x1a, r22 ; 26
> DDRA = (uint32_t)(i>>8);
> 38: 27 2f mov r18, r23
> * 3a: 38 2f mov r19, r24
> * 3c: 49 2f mov r20, r25
> * 3e: 55 27 eor r21, r21
> 40: 2a bb out 0x1a, r18 ; 26
> DDRA = (uint32_t)(i>>16);
> 42: 9c 01 movw r18, r24
> * 44: 44 27 eor r20, r20
> * 46: 55 27 eor r21, r21
> 48: 2a bb out 0x1a, r18 ; 26
> DDRA = (uint32_t)(i>>24);
> 4a: 69 2f mov r22, r25
> * 4c: 77 27 eor r23, r23
> * 4e: 88 27 eor r24, r24
> * 50: 99 27 eor r25, r25
> 52: 6a bb out 0x1a, r22 ; 26
> }
> 54: 08 95 ret
> ---------8<-----------------------------------
>
> Both gcc and g++ produce the same code. And I get the same results both
> with 4.3.5 and 4.7.1
>
> Here is my question:
> Is there any option(s) that will help gcc to not produce those extra
> instructions in such case?
>
>
> Regards,
> - Sylvain
>
>
>


-- 
-- Sylvain Leroux
-- sylvain@chicoree.fr
-- http://www.chicoree.fr

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: Optimizing 32 bits integer manipulation on 8 bit AVR target
  2012-08-30 18:05 ` Sylvain Leroux
@ 2012-08-31 11:51   ` David Brown
  0 siblings, 0 replies; 4+ messages in thread
From: David Brown @ 2012-08-31 11:51 UTC (permalink / raw)
  To: Sylvain Leroux; +Cc: gcc-help

Two points:

avr-gcc is known to produce poor code for a number of 32-bit operations. 
  You can search the bugzilla database for missed optimisations like 
this, in case this particular usage is not already listed (even if the 
avr-gcc porters have limited time to work on such things, they still 
like them to be recorded as bugs).  As a general rule, 32-bit operations 
are not used much on the 8-bit avr, and thus improving the generated 
code is not the highest priority for the developers.

Secondly, why are you worrying about portability for code that is 
clearly avr-specific?  Writing portable code is often a useful aim, but 
when you have code that is tightly tied to particular hardware, there is 
no need to be portable.  This is particularly true for the avr, with its 
separate memory space for flash data - a lot of avr code will not 
compile directly on other architectures (or even on other avr 
compilers).  Thus there is no problem with your second version.


You don't mention what optimisation settings you tried - certainly gcc 
will generate terrible code if you don't have at least -O.  The most 
common setting for avr-gcc is -Os.

One setting that often affects 32-bit code on the avr is 
-fsplit-wide-types.  For some code, you get better results with this 
enabled, but at other times it is best to disable it 
(-fno-split-wide-types).


You may also like to join the avr specific list at
avr-gcc-list@nongnu.org.

mvh.,

David



On 30/08/12 15:05, Sylvain Leroux wrote:
> As a complement to my previous message,
>
> It appears the following C source leads to much better code:
>
> ---------8<-----------------------------------
> void f(uint32_t i) {
> union __attribute__((__packed__)) {
> uint32_t i;
> struct S { uint8_t a,b,c,d; } s;
> } u;
>
> u.i = i | ((uint32_t)(0xFF) << 16);
>
> DDRA = (uint32_t)(u.s.d);
> DDRA = (uint32_t)(u.s.c);
> DDRA = (uint32_t)(u.s.b);
> DDRA = (uint32_t)(u.s.a);
> }
> ---------8<-----------------------------------
>
> avr-objdump:
> ---------8<-----------------------------------
> union __attribute__((__packed__)) {
> uint32_t i;
> struct S { uint8_t a,b,c,d; } s;
> } u;
>
> u.i = i | ((uint32_t)(0xFF) << 16);
> 58: af 6f ori r26, 0xFF ; 255
>
> DDRA = (uint32_t)(u.s.d);
> 5a: ba bb out 0x1a, r27 ; 26
> DDRA = (uint32_t)(u.s.c);
> 5c: aa bb out 0x1a, r26 ; 26
> DDRA = (uint32_t)(u.s.b);
> 5e: 9a bb out 0x1a, r25 ; 26
> DDRA = (uint32_t)(u.s.a);
> 60: 8a bb out 0x1a, r24 ; 26
> ---------8<-----------------------------------
>
> *But*, the C code is no longer portable since I'm using
> "__attribute__((__packed__))". Moreover it requires endianness
> knowledge/assumption.
>
>
> That's why I was hoping for a command line option allowing gcc to
> perform the same optimization.
>
> - Sylvain
>
>
> On 08/30/2012 02:54 PM, Sylvain Leroux wrote:
>> Hi,
>>
>> It seems to me that avr-gcc/avr-g++ is producing sub-optimal code for
>> the 'f' function in the following source code:
>>
>> ---------8<-----------------------------------
>> #include <avr/io.h>
>>
>> void f(uint32_t i) {
>> i |= ((uint32_t)(0xFF) << 16);
>>
>> /* DDRA is an 8 bit register */
>> DDRA = (uint32_t)(i);
>> DDRA = (uint32_t)(i>>8);
>> DDRA = (uint32_t)(i>>16);
>> DDRA = (uint32_t)(i>>24);
>> }
>>
>> int main() {
>> volatile uint32_t n = 0x01020304;
>>
>> f(n);
>> }
>> ---------8<-----------------------------------
>> Having compiled with the following options:
>> avr-gcc c.c -mmcu=attiny2313
>> -Os -ffunction-sections -fdata-sections
>> -g -Wl,--gc-sections -Wl,--print-gc-sections
>> -fipa-cp -fcprop-registers -fweb
>>
>> ... here is the relevant fragment as displayed by avr-objdump. I marked
>> with a star (*) all the instruction that appears to be useless:
>> ---------8<-----------------------------------
>> void f(uint32_t i) {
>> i |= ((uint32_t)(0xFF) << 16);
>> 34: 8f 6f ori r24, 0xFF ; 255
>>
>> DDRA = (uint32_t)(i);
>> 36: 6a bb out 0x1a, r22 ; 26
>> DDRA = (uint32_t)(i>>8);
>> 38: 27 2f mov r18, r23
>> * 3a: 38 2f mov r19, r24
>> * 3c: 49 2f mov r20, r25
>> * 3e: 55 27 eor r21, r21
>> 40: 2a bb out 0x1a, r18 ; 26
>> DDRA = (uint32_t)(i>>16);
>> 42: 9c 01 movw r18, r24
>> * 44: 44 27 eor r20, r20
>> * 46: 55 27 eor r21, r21
>> 48: 2a bb out 0x1a, r18 ; 26
>> DDRA = (uint32_t)(i>>24);
>> 4a: 69 2f mov r22, r25
>> * 4c: 77 27 eor r23, r23
>> * 4e: 88 27 eor r24, r24
>> * 50: 99 27 eor r25, r25
>> 52: 6a bb out 0x1a, r22 ; 26
>> }
>> 54: 08 95 ret
>> ---------8<-----------------------------------
>>
>> Both gcc and g++ produce the same code. And I get the same results both
>> with 4.3.5 and 4.7.1
>>
>> Here is my question:
>> Is there any option(s) that will help gcc to not produce those extra
>> instructions in such case?
>>
>>
>> Regards,
>> - Sylvain
>>
>>
>>
>
>

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: Optimizing 32 bits integer manipulation on 8 bit AVR target
  2012-08-30 15:32 Optimizing 32 bits integer manipulation on 8 bit AVR target Sylvain Leroux
  2012-08-30 18:05 ` Sylvain Leroux
@ 2012-08-31 22:37 ` Georg-Johann Lay
  1 sibling, 0 replies; 4+ messages in thread
From: Georg-Johann Lay @ 2012-08-31 22:37 UTC (permalink / raw)
  To: Sylvain Leroux; +Cc: gcc-help

Sylvain Leroux schrieb:
> Hi,
> 
> It seems to me that avr-gcc/avr-g++ is producing sub-optimal code for 
> the 'f' function in the following source code:
> 
> ---------8<-----------------------------------
> #include <avr/io.h>
> 
> void f(uint32_t i) {
>     i |= ((uint32_t)(0xFF) << 16);
> 
>     /* DDRA is an 8 bit register */
>     DDRA = (uint32_t)(i);
>     DDRA = (uint32_t)(i>>8);
>     DDRA = (uint32_t)(i>>16);
>     DDRA = (uint32_t)(i>>24);
> }
> 
> int main() {
>     volatile uint32_t n = 0x01020304;
> 
>     f(n);
> }
> ---------8<-----------------------------------
> Having compiled with the following options:
> avr-gcc c.c -mmcu=attiny2313
>             -Os -ffunction-sections -fdata-sections
>             -g -Wl,--gc-sections -Wl,--print-gc-sections
>             -fipa-cp -fcprop-registers -fweb
> 
> ... here is the relevant fragment as displayed by avr-objdump. I marked 
> with a star (*) all the instruction that appears to be useless:
> ---------8<-----------------------------------
> void f(uint32_t i) {
>     i |= ((uint32_t)(0xFF) << 16);
>   34:   8f 6f           ori     r24, 0xFF       ; 255
> 
>     DDRA = (uint32_t)(i);
>   36:   6a bb           out     0x1a, r22       ; 26
>     DDRA = (uint32_t)(i>>8);
>   38:   27 2f           mov     r18, r23
> * 3a:   38 2f           mov     r19, r24
> * 3c:   49 2f           mov     r20, r25
> * 3e:   55 27           eor     r21, r21
>   40:   2a bb           out     0x1a, r18       ; 26
>     DDRA = (uint32_t)(i>>16);
>   42:   9c 01           movw    r18, r24
> * 44:   44 27           eor     r20, r20
> * 46:   55 27           eor     r21, r21
>   48:   2a bb           out     0x1a, r18       ; 26
>     DDRA = (uint32_t)(i>>24);
>   4a:   69 2f           mov     r22, r25
> * 4c:   77 27           eor     r23, r23
> * 4e:   88 27           eor     r24, r24
> * 50:   99 27           eor     r25, r25
>   52:   6a bb           out     0x1a, r22       ; 26
> }
>   54:   08 95           ret
> ---------8<-----------------------------------
> 
> Both gcc and g++ produce the same code. And I get the same results both 
> with 4.3.5 and 4.7.1
> 
> Here is my question:
> Is there any option(s) that will help gcc to not produce those extra 
> instructions in such case?

This is PR49807 which is not avr specific.

PR49807 is a missed RTL optimization, but other targets, such as 32-bit
targets, typically don't see it because their SFRs are 32 bits wide.

Just compile the following test case and the issue is gone.
If you remove the #defines for a, b, c and d, you see PR49807 again.

Johann

--

typedef unsigned long uint32_t;

#define DDRA (*(volatile unsigned char*) 0x3A)

extern unsigned char a, b, c, d;

#define a DDRA
#define b DDRA
#define c DDRA
#define d DDRA

void f (uint32_t i)
{
     i |= 0xFFul << 16;

     /* DDRA is an 8 bit register */
     a = i;
     b = i >> 8;
     c = i >> 16;
     d = i >> 24;
}


^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2012-08-31 19:24 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-08-30 15:32 Optimizing 32 bits integer manipulation on 8 bit AVR target Sylvain Leroux
2012-08-30 18:05 ` Sylvain Leroux
2012-08-31 11:51   ` David Brown
2012-08-31 22:37 ` Georg-Johann Lay

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).