* B^HDEAD code generation (i386)
@ 2023-01-09 11:31 Stefan Kanthak
0 siblings, 0 replies; only message in thread
From: Stefan Kanthak @ 2023-01-09 11:31 UTC (permalink / raw)
To: gcc
Hi,
compile the following 64-bit GCD routine for the i386 processor,
with full optimization and the preprocessor macro CTZ defined:
--- gcddi3.c ---
// Stein's algorithm: greatest common divisor
unsigned long long __gcddi3(unsigned long long p, unsigned long long q)
{
unsigned r, s = 0;
unsigned long long t;
if (p == 0)
return q;
if (q == 0)
return p;
if (p == q)
return p;
#ifndef CTZ
while (((p | q) & 1) == 0)
p >>= 1, q >>= 1, s++;
while ((p & 1) == 0)
p >>= 1;
do {
while ((q & 1) == 0)
q >>= 1;
#elif CTZ != 32
s = __builtin_ctzll(p | q), p >>= s, q >>= s;
r = __builtin_ctzll(p), p >>= r;
do {
r = __builtin_ctzll(q), q >>= r;
#else
if (((unsigned long) p | (unsigned long) q) == 0)
p >>= 32, q >>= 32, s = 32;
r = __builtin_ctzl((unsigned long) p | (unsigned long) q), p >>= r, q >>= r, s += r;
if ((unsigned long) p == 0)
p >>= 32;
r = __builtin_ctzl(p), p >>= r;
do
{
if ((unsigned long) q == 0)
q >>= 32;
r = __builtin_ctzl(q), q >>= r;
#endif
if (p < q)
t = q, q = p, p = t;
} while (q -= p);
return p << s;
}
--- EOF ---
GCC 12.2: gcc -DCTZ -m32 -mno-sse -O3 gcddi3.c
# https://godbolt.org/z/c7cao8M57
__gcddi3(unsigned long long, unsigned long long):
push ebp
push edi
push esi
push ebx
sub esp, 28
mov ebx, DWORD PTR [esp+52]
mov ecx, DWORD PTR [esp+48]
mov esi, DWORD PTR [esp+56]
mov edi, DWORD PTR [esp+60]
mov ebp, ebx
mov DWORD PTR [esp+8], ecx
or ebp, ecx
mov DWORD PTR [esp+12], ebx
mov eax, esi
mov edx, edi
je .L1
mov eax, ecx
mov edx, ebx
xor eax, esi
xor edx, edi
or eax, edx
je .L6
mov eax, esi
or eax, edi
je .L6
mov eax, ecx
mov edx, ebx
sub esp, 8
xor ebp, ebp
or edx, edi
or eax, esi
push edx #
push eax #
call __ctzdi2 # Oops: shouldn't this be inlined with -O3?!
pop edx #
pop ecx #
mov ebx, eax
mov edx, DWORD PTR [esp+20]
mov eax, DWORD PTR [esp+16]
mov ecx, ebx
shrd eax, edx, cl
shr edx, cl
test bl, 32
cmovne eax, edx
cmovne edx, ebp
shrd esi, edi, cl
xor ebp, ebp
shr edi, cl
test bl, 32
mov DWORD PTR [esp+16], eax
cmovne esi, edi
cmovne edi, ebp
mov DWORD PTR [esp+20], edx
push edx #
push eax #
call __ctzdi2 # Oops: shouldn't this be inlined with -O3?!
mov edx, DWORD PTR [esp+28]
add esp, 16
mov ebp, eax
mov eax, DWORD PTR [esp+8]
mov ecx, ebp
xor ebp, ebp
shrd eax, edx, cl
shr edx, cl
and ecx, 32
cmovne eax, edx
cmovne edx, ebp
mov DWORD PTR [esp+8], eax
mov DWORD PTR [esp+12], edx
.L4:
sub esp, 8
xor ebp, ebp
push edi #
push esi #
call __ctzdi2 # Oops: shouldn't this be inlined with -O3?!
add esp, 16
mov edx, DWORD PTR [esp+12]
mov ecx, eax
shrd esi, edi, cl
shr edi, cl
test al, 32
mov eax, DWORD PTR [esp+8]
cmovne esi, edi
cmovne edi, ebp
mov ecx, edx
cmp eax, esi
sbb ecx, edi
jnc .L3
mov DWORD PTR [esp+8], esi
mov esi, eax
mov DWORD PTR [esp+12], edi
mov edi, edx
.L3:
sub esi, DWORD PTR [esp+8]
sbb edi, DWORD PTR [esp+12]
mov eax, edi
or eax, esi
jne .L4
mov eax, DWORD PTR [esp+8]
mov edx, DWORD PTR [esp+12]
mov ecx, ebx
xor ebx, ebx
shld edx, eax, cl
sal eax, cl
and ecx, 32
cmovne edx, eax
cmovne eax, ebx
.L1:
add esp, 28
pop ebx
pop esi
pop edi
pop ebp
ret
.L6:
mov eax, DWORD PTR [esp+8]
mov edx, DWORD PTR [esp+12]
add esp, 28
pop ebx
pop esi
pop edi
pop ebp
ret
Compile it again, now with the preprocessor macro CTZ=32 defined
to avoid/inline the calls of __ctzdi2:
GCC 12.2: gcc -DCTZ=32 -m32 -mno-sse -O3 gcddi3.c
# https://godbolt.org/z/deo65387b
__gcddi3(unsigned long long, unsigned long long):
push ebp
push edi
push esi
push ebx
mov edi, DWORD PTR [esp+24]
mov esi, DWORD PTR [esp+20]
mov eax, DWORD PTR [esp+28]
mov edx, DWORD PTR [esp+32]
mov ebp, edi
or ebp, esi
mov ecx, eax
mov ebx, edx
je .L1
mov ecx, esi
mov ebx, edi
xor ecx, eax
xor ebx, edx
or ecx, ebx
je .L9
mov ebx, eax
or ebx, edx
je .L9
mov ecx, esi
xor ebx, ebx
or ecx, eax
jne .L3
mov esi, edi
mov eax, edx
xor edi, edi
xor edx, edx
mov ecx, esi
mov ebx, 32
or ecx, eax
.L3:
rep bsf ecx, ecx
xor ebp, ebp # OUCH: BSF and TZCNT return at most 31,
shrd esi, edi, cl
shr edi, cl
test cl, 32 # so this is dead code!
cmovne esi, edi #
cmovne edi, ebp #
shrd eax, edx, cl
xor ebp, ebp # OUCH: EBP is already 0 here,
shr edx, cl
test cl, 32 # and BSF and TZCNT return at most 31,
cmovne eax, edx # so this is dead code!
cmovne edx, ebp #
lea ebp, [ebx+ecx]
mov ecx, esi # Oops: superfluous
test esi, esi
jne .L4
mov esi, edi
xor edi, edi
mov ecx, esi # Oops: superfluous
.L4:
rep bsf ecx, ecx # Oops: should be BSF ECX, ESI
xor ebx, ebx # OUCH: BSF and TZCNT return at most 31,
shrd esi, edi, cl
shr edi, cl
and ecx, 32 # and there's no need to modify ECX here,
cmovne esi, edi # so this is dead code!
cmovne edi, ebx #
.L7:
mov ecx, eax # Oops: superfluous
test eax, eax
jne .L5
mov eax, edx
xor edx, edx
mov ecx, eax # Oops: superfluous
.L5:
rep bsf ecx, ecx # Oops: should be BSF ECX, EAX
xor ebx, ebx # OUCH: BSF and TZCNT return at most 31,
shrd eax, edx, cl
shr edx, cl
test cl, 32 # so this is dead code!
cmovne eax, edx #
cmovne edx, ebx #
mov ebx, edi
cmp esi, eax
sbb ebx, edx
jnc .L6
mov ecx, esi
mov ebx, edi
mov esi, eax
mov edi, edx
mov eax, ecx
mov edx, ebx
.L6:
sub eax, esi
sbb edx, edi
mov ebx, edx
or ebx, eax
jne .L7
mov ecx, ebp
xor eax, eax
shld edi, esi, cl
sal esi, cl
test cl, 32
cmovne edi, esi
cmovne esi, eax
mov ebx, edi
mov ecx, esi
.L1:
mov edx, ebx
mov eax, ecx
pop ebx
pop esi
pop edi
pop ebp
ret
.L9:
mov ebx, edi # Ouch: GCC likes to play shell games!
mov ecx, esi #
mov edx, ebx #
mov eax, ecx #
pop ebx
pop esi
pop edi
pop ebp
ret
22 superfluous instructions out of the total 104 instructions
NOT AMUSED
Stefan Kanthak
PS: <https://skanthak.homepage.t-online.de/gcc.html#case27-i386-s>
shows properly written assembly code.
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2023-01-09 11:38 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-01-09 11:31 B^HDEAD code generation (i386) Stefan Kanthak
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).