* EPIC optimiser failures (i386)
@ 2023-01-09 11:49 Stefan Kanthak
0 siblings, 0 replies; only message in thread
From: Stefan Kanthak @ 2023-01-09 11:49 UTC (permalink / raw)
To: gcc
Hi,
compile the following routine for the i386 processor,
with optimisation:
--- double.c ---
// IEEE-754 binary64 double-precision floating point
// binary64 != ±0.0 -> 0
// binary64 == +0.0 -> +1
// binary64 == -0.0 -> -1
int plusminus0(unsigned long long binary64)
{
if (binary64 != -binary64) // neither +0.0 nor -0.0
return 0;
if (binary64 == 0)
return 1;
return -1;
}
--- EOF ---
GCC 12.2 gcc -m32 -O2 double.c
# https://godbolt.org/z/17as1M1xM
plusminus0(unsigned long long):
push esi
push ebx
mov ecx, DWORD PTR [esp+12]
mov ebx, DWORD PTR [esp+16]
mov eax, ecx
neg eax
mov edx, ebx
adc edx, 0
xor eax, ecx
neg edx
xor edx, ebx
or eax, edx
jne .L5
or ecx, ebx
pop ebx
cmp ecx, 1
sbb esi, esi
and esi, 2
sub esi, 1
mov eax, esi
pop esi
ret
.L5:
xor esi, esi
pop ebx
mov eax, esi
pop esi
ret
OUCH: these 27 instructions in 56 bytes are as BAD^WHORRIBLE as code
could get!
EVERY optimising^Wcompiler writer should be aware that
if (binary64 == -binary64)
is just a shorthand for
if (binary64 == 0 - binary64)
and thus equivalent to
if (binary64 + binary64 == 0)
which SHOULD lead to the following (optionally branch-free) code:
mov ecx, dword ptr [esp+4]
mov edx, dword ptr [esp+8] # edx:ecx = binary64
add ecx, ecx
adc edx, edx
sbb eax, eax # eax = (binary64 < 0) ? -1 : 0
.ifnotdef BRANCHFREE
or ecx, edx
jz .L0 # binary64 == -binary64?
stc # CF = 1
adc eax, eax # eax = (binary64 < 0) ? -1 : 1
.L0:
.else
stc # CF = 1
adc eax, eax # eax = (binary64 < 0) ? -1 : 1
or ecx, edx # ecx = (binary64 == -binary64) ? 0 : *
neg ecx # CF = (binary64 != -binary64)
sbb ecx, ecx # ecx = (binary64 != -binary64) ? -1 : 0
not ecx # ecx = (binary64 == -binary64) ? -1 : 0
and eax, ecx
.endif
ret
Either 10 instructions in 22 bytes or 13 instructions in 28 bytes,
i.e. less than half the instructions and bytes!
Since the lower half of the binary64 only needs to be tested against 0,
a TRUE optimising compiler would but come up with the following code:
mov eax, dword ptr [esp+8] # upper half of binary64
cdq # edx = (binary64 < 0) ? -1 : 0
stc # CF = 1
adc edx, edx # edx = (binary64 < 0) ? -1 : 1
add eax, eax
or eax, dword ptr [esp+4]
neg eax # CF = (binary64 != -binary64)
sbb eax, eax # eax = (binary64 != -binary64) ? -1 : 0
not eax # eax = (binary64 == -binary64) ? -1 : 0
and eax, edx
ret
11 instructions in 23 bytes.
--- single.c ---
// IEEE-754 binary32 single-precision floating point
int plusminus0(unsigned long binary32)
{
if (binary32 != -binary32) // neither +0.0 nor -0.0
return 0;
if (binary32 == 0)
return 1;
return -1;
}
--- EOF ---
GCC 12.2 gcc -m32 -O2 single.c
# https://godbolt.org/z/djT748e81
plusminus0(unsigned int):
mov edx, DWORD PTR [esp+4]
xor eax, eax
mov ecx, edx
neg ecx
cmp ecx, edx
jne .L1
cmp ecx, 1
sbb eax, eax
and eax, 2
sub eax, 1
.L1:
ret
OOPS (11 instructions in 26 bytes)!
An optimising compiler SHOULD but generate 8 instructions in 16 bytes:
xor eax, eax
mov ecx, DWORD PTR [esp+4]
add ecx, ecx
jnz .L1 # binary32 != -binary32?
sbb eax, eax # eax = (binary32 < 0) ? -1 : 0
stc # CF = 1
adc eax, eax # eax = (binary32 < 0) ? -1 : 1
.L1:
ret
A TRUE optimising compiler would butgenerate the following branch-free
code, using 7 or 8 instructions in 19 or 18 bytes:
.if 0
mov eax, DWORD PTR [esp+4]
neg eax # OF = (binary32 == -0.0),
# ZF = (binary32 == +0.0)
.else
xor eax, eax
sub eax, DWORD PTR [esp+4]
.endif
seto ah
setz al
sub al, ah # al = ZF - OF
.if 0
cbw
cwde
.else
movsx eax, al
.endif
ret
Stefan Kanthak
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2023-01-09 11:50 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-01-09 11:49 EPIC optimiser failures (i386) Stefan Kanthak
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).