m68k - GCC 4.4.0 generates not so good code from asm inline

public inbox for gcc@gcc.gnu.org
 help / color / mirror / Atom feed

* m68k - GCC 4.4.0 generates not so good code from asm inline
@ 2009-07-29 11:41 ami_stuff
  2009-07-29 16:57 ` ami_stuff
  0 siblings, 1 reply; 3+ messages in thread
From: ami_stuff @ 2009-07-29 11:41 UTC (permalink / raw)
  To: gcc

Hi,

Here is a C source code which I compiled with GCC 3.4.0 and GCC 4.4.0. GCC 3.4.0 output looks a lot better.

#include <stdio.h>
#include <stdint.h>

#define umul_ppmm(xh, xl, a, b) \
__asm__ ("| Inlined umul_ppmm\n" \
" move.l %0,%/d5\n" \
" move.l %1,%/d4\n" \
" moveq #16,%/d3\n" \
" move.l %0,%/d2\n" \
" mulu %1,%0\n" \
" lsr.l %/d3,%/d4\n" \
" lsr.l %/d3,%/d5\n" \
" mulu %/d4,%/d2\n" \
" mulu %/d5,%1\n" \
" mulu %/d5,%/d4\n" \
" move.l %/d2,%/d5\n" \
" lsr.l %/d3,%/d2\n" \
" add.w %1,%/d5\n" \
" addx.l %/d2,%/d4\n" \
" lsl.l %/d3,%/d5\n" \
" lsr.l %/d3,%1\n" \
" add.l %/d5,%0\n" \
" addx.l %/d4,%1" \
: "=d" ((uint32_t) (xl)), "=d" ((uint32_t) (xh)) \
: "0" ((uint32_t) (a)), "1" ((uint32_t) (b)) \
: "d2", "d3", "d4", "d5")

inline int64_t MUL64(int a, int b)
{
uint32_t au = a;
uint32_t bu = b;

uint32_t resh, resl;
uint64_t res;

umul_ppmm(resh, resl, au, bu);

if (a < 0)
resh -= bu;
if (b < 0)
resh -= au;

res = ((uint64_t)resh << 32) | resl;

return res;
} 


GCC 4.4.0 asm output:

#NO_APP
.text
.even
.globl _MUL64
_MUL64:
movem.l #16128,-(sp)
move.l 28(sp),d0
move.l 32(sp),a0
move.l d0,d6
move.l a0,d1
#APP
;# 36 "mul642.c" 1
| Inlined umul_ppmm
move.l d6,d5
move.l d1,d4
moveq #16,d3
move.l d6,d2
mulu d1,d6
lsr.l d3,d4
lsr.l d3,d5
mulu d4,d2
mulu d5,d1
mulu d5,d4
move.l d2,d5
lsr.l d3,d2
add.w d1,d5
addx.l d2,d4
lsl.l d3,d5
lsr.l d3,d1
add.l d5,d6
addx.l d4,d1
#NO_APP
tst.l d0
jlt L6
tst.l a0
jlt L7
L3:
move.l d1,d2
clr.l d3
move.l d2,d0
move.l d3,d1
or.l d6,d1
move.l d0,d6
move.l d1,d7
move.l d7,d1
movem.l (sp)+,#252
rts
L7:
sub.l d0,d1
move.l d1,d2
clr.l d3
move.l d2,d0
move.l d3,d1
or.l d6,d1
move.l d0,d6
move.l d1,d7
move.l d7,d1
movem.l (sp)+,#252
rts
L6:
sub.l a0,d1
tst.l a0
jge L3
jra L7 

GCC 3.4.0 asm output:

#NO_APP
.text
.even
.globl _MUL64
_MUL64:
moveml #0x3f00,sp@-
movel sp@(28),d1
movel sp@(32),d0
movel d1,d7
movel d0,d6
#APP
| Inlined umul_ppmm
move.l d7,d5
move.l d6,d4
moveq #16,d3
move.l d7,d2
mulu d6,d7
lsr.l d3,d4
lsr.l d3,d5
mulu d4,d2
mulu d5,d6
mulu d5,d4
move.l d2,d5
lsr.l d3,d2
add.w d6,d5
addx.l d2,d4
lsl.l d3,d5
lsr.l d3,d6
add.l d5,d7
addx.l d4,d6
#NO_APP
tstl d1
jlt L5
tstl d0
jge L3
jra L6
.even
L5:
subl d0,d6
tstl d0
jge L3
.even
L6:
subl d1,d6
.even
L3:
movel d6,d0
clrl d1
orl d7,d1
moveml sp@+,#0xfc
rts 

Is it a regression?

Regards

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: m68k - GCC 4.4.0 generates not so good code from asm inline
  2009-07-29 11:41 m68k - GCC 4.4.0 generates not so good code from asm inline ami_stuff
@ 2009-07-29 16:57 ` ami_stuff
  2009-07-29 17:12   ` Bernd Roesch
  0 siblings, 1 reply; 3+ messages in thread
From: ami_stuff @ 2009-07-29 16:57 UTC (permalink / raw)
  To: ami_stuff; +Cc: gcc

When I use -O1 with GCC 4.4.0 (-m68060 -fomit-frame-pointer), I get better code.

#include <stdio.h>
#include <stdint.h>

inline int64_t MUL64(int a, int b)
{

uint32_t resh, resl;
uint32_t au = a;
uint32_t bu = b;

__asm__ ("move.l %0, d5\n\t"
"move.l %1, d4\n\t"
"moveq #16, d3\n\t"
"move.l %0, d2\n\t"
"mulu %1, %0\n\t"
"lsr.l d3, d4\n\t"
"lsr.l d3, d5\n\t"
"mulu d4, d2\n\t"
"mulu d5, %1\n\t"
"mulu d5, d4\n\t"
"move.l d2, d5\n\t"
"lsr.l d3, d2\n\t"
"add.w %1, d5\n\t"
"addx.l d2, d4\n\t"
"lsl.l d3, d5\n\t"
"lsr.l d3, %1\n\t"
"add.l d5, %0\n\t"
"addx.l d4, %1\n\t"
: "=d"(resl), "=d"(resh)
: "0"(au), "1"(bu)
: "d2", "d3", "d4", "d5");

if (a < 0)
resh -= bu;
if (b < 0)
resh -= au;

return ((uint64_t)resh << 32) | resl;
}

GCC 4.4.0 -O3:

#NO_APP
.text
.even
.globl _MUL64
_MUL64:
movem.l #16128,-(sp)
move.l 28(sp),d0
move.l 32(sp),a0
move.l d0,d6
move.l a0,d1
#APP
;# 11 "mul645.c" 1
move.l d6, d5
move.l d1, d4
moveq #16, d3
move.l d6, d2
mulu d1, d6
lsr.l d3, d4
lsr.l d3, d5
mulu d4, d2
mulu d5, d1
mulu d5, d4
move.l d2, d5
lsr.l d3, d2
add.w d1, d5
addx.l d2, d4
lsl.l d3, d5
lsr.l d3, d1
add.l d5, d6
addx.l d4, d1

#NO_APP
tst.l d0
jlt L6
tst.l a0
jlt L7
L3:
move.l d1,d2
clr.l d3
move.l d2,d0
move.l d3,d1
or.l d6,d1
move.l d0,d6
move.l d1,d7
move.l d7,d1
movem.l (sp)+,#252
rts
L7:
sub.l d0,d1
move.l d1,d2
clr.l d3
move.l d2,d0
move.l d3,d1
or.l d6,d1
move.l d0,d6
move.l d1,d7
move.l d7,d1
movem.l (sp)+,#252
rts
L6:
sub.l a0,d1
tst.l a0
jge L3
jra L7

GCC 4.4.0 -O2:

#NO_APP
.text
.even
.globl _MUL64
_MUL64:
movem.l #16128,-(sp)
move.l 28(sp),d0
move.l 32(sp),a0
move.l d0,d6
move.l a0,d1
#APP
;# 11 "mul645.c" 1
move.l d6, d5
move.l d1, d4
moveq #16, d3
move.l d6, d2
mulu d1, d6
lsr.l d3, d4
lsr.l d3, d5
mulu d4, d2
mulu d5, d1
mulu d5, d4
move.l d2, d5
lsr.l d3, d2
add.w d1, d5
addx.l d2, d4
lsl.l d3, d5
lsr.l d3, d1
add.l d5, d6
addx.l d4, d1

#NO_APP
tst.l d0
jlt L6
tst.l a0
jlt L7
L3:
move.l d1,d2
clr.l d3
move.l d2,d0
move.l d3,d1
or.l d6,d1
move.l d0,d6
move.l d1,d7
move.l d7,d1
movem.l (sp)+,#252
rts
L7:
sub.l d0,d1
move.l d1,d2
clr.l d3
move.l d2,d0
move.l d3,d1
or.l d6,d1
move.l d0,d6
move.l d1,d7
move.l d7,d1
movem.l (sp)+,#252
rts
L6:
sub.l a0,d1
tst.l a0
jge L3
jra L7

GCC 4.4.0 -O1:

#NO_APP
.text
.even
.globl _MUL64
_MUL64:
movem.l #16176,-(sp)
move.l 40(sp),d0
move.l 36(sp),a2
move.l a2,d7
move.l d0,d6
#APP
;# 11 "mul645.c" 1
move.l d7, d5
move.l d6, d4
moveq #16, d3
move.l d7, d2
mulu d6, d7
lsr.l d3, d4
lsr.l d3, d5
mulu d4, d2
mulu d5, d6
mulu d5, d4
move.l d2, d5
lsr.l d3, d2
add.w d6, d5
addx.l d2, d4
lsl.l d3, d5
lsr.l d3, d6
add.l d5, d7
addx.l d4, d6

#NO_APP
tst.l a2
jge L2
sub.l d0,d6
L2:
tst.l d0
jge L3
sub.l a2,d6
L3:
move.l d6,d1
clr.l d2
or.l d7,d2
move.l d1,d0
move.l d2,d1
movem.l (sp)+,#3324
rts

GCC 4.4.0 -O0:

#NO_APP
.text
.even
.globl _MUL64
_MUL64:
lea (-16,sp),sp
movem.l #16128,-(sp)
move.l 44(sp),32(sp)
move.l 48(sp),36(sp)
move.l 32(sp),d1
move.l 36(sp),d0
#APP
;# 11 "mul645.c" 1
move.l d1, d5
move.l d0, d4
moveq #16, d3
move.l d1, d2
mulu d0, d1
lsr.l d3, d4
lsr.l d3, d5
mulu d4, d2
mulu d5, d0
mulu d5, d4
move.l d2, d5
lsr.l d3, d2
add.w d0, d5
addx.l d2, d4
lsl.l d3, d5
lsr.l d3, d0
add.l d5, d1
addx.l d4, d0

#NO_APP
move.l d1,28(sp)
move.l d0,24(sp)
tst.l 44(sp)
jge L2
move.l 36(sp),d0
sub.l d0,24(sp)
L2:
tst.l 48(sp)
jge L3
move.l 32(sp),d2
sub.l d2,24(sp)
L3:
move.l 24(sp),d7
clr.l d6
move.l d7,d0
clr.l d1
move.l 28(sp),a1
lea 0.w,a0
move.l a0,d2
move.l a1,d3
or.l d2,d0
or.l d3,d1
movem.l (sp)+,#252
lea (16,sp),sp
rts 

Regards

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: m68k - GCC 4.4.0 generates not so good code from asm   inline
  2009-07-29 16:57 ` ami_stuff
@ 2009-07-29 17:12   ` Bernd Roesch
  0 siblings, 0 replies; 3+ messages in thread
From: Bernd Roesch @ 2009-07-29 17:12 UTC (permalink / raw)
  To: ami_stuff; +Cc: gcc

Hello 

On 29.07.09, you wrote:

if you have a account you can report that as a Bug.
gcc4 have the advantage its possible to switch in source optimizer on or
off, but how it work, i dont know.

> When I use -O1 with GCC 4.4.0 (-m68060 -fomit-frame-pointer), I get better
> code.
> 
> #include <stdio.h>
> #include <stdint.h>
> 
> inline int64_t MUL64(int a, int b)
> {
> 
> uint32_t resh, resl;
> uint32_t au = a;
> uint32_t bu = b;
> 
> __asm__ ("move.l %0, d5\n\t"
> "move.l %1, d4\n\t"
> "moveq #16, d3\n\t"
> "move.l %0, d2\n\t"
> "mulu %1, %0\n\t"
> "lsr.l d3, d4\n\t"
> "lsr.l d3, d5\n\t"
> "mulu d4, d2\n\t"
> "mulu d5, %1\n\t"
> "mulu d5, d4\n\t"
> "move.l d2, d5\n\t"
> "lsr.l d3, d2\n\t"
> "add.w %1, d5\n\t"
> "addx.l d2, d4\n\t"
> "lsl.l d3, d5\n\t"
> "lsr.l d3, %1\n\t"
> "add.l d5, %0\n\t"
> "addx.l d4, %1\n\t"
> : "=d"(resl), "=d"(resh)
> : "0"(au), "1"(bu)
> : "d2", "d3", "d4", "d5");
> 
> if (a < 0)
> resh -= bu;
> if (b < 0)
> resh -= au;
> 
> return ((uint64_t)resh << 32) | resl;
> }
> 
> GCC 4.4.0 -O3:
> 
> #NO_APP
> .text
> .even
> .globl _MUL64
> _MUL64:
> movem.l #16128,-(sp)
> move.l 28(sp),d0
> move.l 32(sp),a0
> move.l d0,d6
> move.l a0,d1
> #APP
> ;# 11 "mul645.c" 1
> move.l d6, d5
> move.l d1, d4
> moveq #16, d3
> move.l d6, d2
> mulu d1, d6
> lsr.l d3, d4
> lsr.l d3, d5
> mulu d4, d2
> mulu d5, d1
> mulu d5, d4
> move.l d2, d5
> lsr.l d3, d2
> add.w d1, d5
> addx.l d2, d4
> lsl.l d3, d5
> lsr.l d3, d1
> add.l d5, d6
> addx.l d4, d1
> 
> #NO_APP
> tst.l d0
> jlt L6
> tst.l a0
> jlt L7
> L3:
> move.l d1,d2
> clr.l d3
> move.l d2,d0
> move.l d3,d1
> or.l d6,d1
> move.l d0,d6
> move.l d1,d7
> move.l d7,d1
> movem.l (sp)+,#252
> rts
> L7:
> sub.l d0,d1
> move.l d1,d2
> clr.l d3
> move.l d2,d0
> move.l d3,d1
> or.l d6,d1
> move.l d0,d6
> move.l d1,d7
> move.l d7,d1
> movem.l (sp)+,#252
> rts
> L6:
> sub.l a0,d1
> tst.l a0
> jge L3
> jra L7
> 
> GCC 4.4.0 -O2:
> 
> #NO_APP
> .text
> .even
> .globl _MUL64
> _MUL64:
> movem.l #16128,-(sp)
> move.l 28(sp),d0
> move.l 32(sp),a0
> move.l d0,d6
> move.l a0,d1
> #APP
> ;# 11 "mul645.c" 1
> move.l d6, d5
> move.l d1, d4
> moveq #16, d3
> move.l d6, d2
> mulu d1, d6
> lsr.l d3, d4
> lsr.l d3, d5
> mulu d4, d2
> mulu d5, d1
> mulu d5, d4
> move.l d2, d5
> lsr.l d3, d2
> add.w d1, d5
> addx.l d2, d4
> lsl.l d3, d5
> lsr.l d3, d1
> add.l d5, d6
> addx.l d4, d1
> 
> #NO_APP
> tst.l d0
> jlt L6
> tst.l a0
> jlt L7
> L3:
> move.l d1,d2
> clr.l d3
> move.l d2,d0
> move.l d3,d1
> or.l d6,d1
> move.l d0,d6
> move.l d1,d7
> move.l d7,d1
> movem.l (sp)+,#252
> rts
> L7:
> sub.l d0,d1
> move.l d1,d2
> clr.l d3
> move.l d2,d0
> move.l d3,d1
> or.l d6,d1
> move.l d0,d6
> move.l d1,d7
> move.l d7,d1
> movem.l (sp)+,#252
> rts
> L6:
> sub.l a0,d1
> tst.l a0
> jge L3
> jra L7
> 
> GCC 4.4.0 -O1:
> 
> #NO_APP
> .text
> .even
> .globl _MUL64
> _MUL64:
> movem.l #16176,-(sp)
> move.l 40(sp),d0
> move.l 36(sp),a2
> move.l a2,d7
> move.l d0,d6
> #APP
> ;# 11 "mul645.c" 1
> move.l d7, d5
> move.l d6, d4
> moveq #16, d3
> move.l d7, d2
> mulu d6, d7
> lsr.l d3, d4
> lsr.l d3, d5
> mulu d4, d2
> mulu d5, d6
> mulu d5, d4
> move.l d2, d5
> lsr.l d3, d2
> add.w d6, d5
> addx.l d2, d4
> lsl.l d3, d5
> lsr.l d3, d6
> add.l d5, d7
> addx.l d4, d6
> 
> #NO_APP
> tst.l a2
> jge L2
> sub.l d0,d6
> L2:
> tst.l d0
> jge L3
> sub.l a2,d6
> L3:
> move.l d6,d1
> clr.l d2
> or.l d7,d2
> move.l d1,d0
> move.l d2,d1
> movem.l (sp)+,#3324
> rts
> 
> GCC 4.4.0 -O0:
> 
> #NO_APP
> .text
> .even
> .globl _MUL64
> _MUL64:
> lea (-16,sp),sp
> movem.l #16128,-(sp)
> move.l 44(sp),32(sp)
> move.l 48(sp),36(sp)
> move.l 32(sp),d1
> move.l 36(sp),d0
> #APP
> ;# 11 "mul645.c" 1
> move.l d1, d5
> move.l d0, d4
> moveq #16, d3
> move.l d1, d2
> mulu d0, d1
> lsr.l d3, d4
> lsr.l d3, d5
> mulu d4, d2
> mulu d5, d0
> mulu d5, d4
> move.l d2, d5
> lsr.l d3, d2
> add.w d0, d5
> addx.l d2, d4
> lsl.l d3, d5
> lsr.l d3, d0
> add.l d5, d1
> addx.l d4, d0
> 
> #NO_APP
> move.l d1,28(sp)
> move.l d0,24(sp)
> tst.l 44(sp)
> jge L2
> move.l 36(sp),d0
> sub.l d0,24(sp)
> L2:
> tst.l 48(sp)
> jge L3
> move.l 32(sp),d2
> sub.l d2,24(sp)
> L3:
> move.l 24(sp),d7
> clr.l d6
> move.l d7,d0
> clr.l d1
> move.l 28(sp),a1
> lea 0.w,a0
> move.l a0,d2
> move.l a1,d3
> or.l d2,d0
> or.l d3,d1
> movem.l (sp)+,#252
> lea (16,sp),sp
> rts 
> 
> Regards
> 
Regards

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2009-07-29 17:12 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-07-29 11:41 m68k - GCC 4.4.0 generates not so good code from asm inline ami_stuff
2009-07-29 16:57 ` ami_stuff
2009-07-29 17:12   ` Bernd Roesch

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).