* Superfluous branches due to insufficient flow analysis
@ 2021-08-13 18:58 Stefan Kanthak
2021-08-13 19:22 ` Gabriel Ravier
0 siblings, 1 reply; 4+ messages in thread
From: Stefan Kanthak @ 2021-08-13 18:58 UTC (permalink / raw)
To: GCC Development
Hi,
compile the following naive implementation of nextafter() for AMD64:
JFTR: ignore the aliasing casts, they don't matter here!
$ cat repro.c
double nextafter(double from, double to)
{
if (to != to)
return to; // to is NAN
if (from != from)
return from; // from is NAN
if (from == to) // neither from nor to can be NAN here!
return to;
if (from == 0.0) // dito!
return to < 0.0 ? -0x1.0p-1074 : 0x1.0p-1074;
unsigned long long ull = *(unsigned long long *) &from;
if ((from < to) == (from < 0.0))
ull--;
else
ull++;
return *(double *) &ull;
}
$ gcc -m64 -o- -O3 -S repro.c
...
nextafter:
ucomisd %xmm1, %xmm1 // sets PF for unordered result, i.e. when at
jp .L10 // least one operand is NAN
ucomisd %xmm0, %xmm0 // same here
jp .L1
ucomisd %xmm0, %xmm1
jnp .L14 // OUCH: PF can't be set here!
// OUCH: and if it were, it's MORE LIKELY to be
// clear, so this branch would be taken
// ... against the branch prediction
.L11:
pxor %xmm2, %xmm2 // OUCH: switching from FP SSE to integer SSE and
// vice versa incurs a penalty of 1 cycle
// on quite a lot Intel Core processors!
// Better use XORPD instead (which is even 1 byte
// shorter)!
ucomisd %xmm2, %xmm0
jnp .L15 // OUCH: there's still no need to check PF here!
.L4:
comisd %xmm0, %xmm1
movq %xmm0, %rdx
leaq -1(%rdx), %rax
seta %r8b
comisd %xmm0, %xmm2
seta %cl
addq $1, %rdx
cmpb %cl, %r8b
cmovne %rdx, %rax
movq %rax, %xmm0
.L1:
ret
.L14:
jne .L11
.L10:
movapd %xmm1, %xmm0
ret
.L15:
jne .L4
movabsq $-9223372036854775808, %rdx
movq %xmm1, %rax
andq %rdx, %rax
orq $1, %rax
movq %rax, %xmm0
ret
Stefan
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: Superfluous branches due to insufficient flow analysis
2021-08-13 18:58 Superfluous branches due to insufficient flow analysis Stefan Kanthak
@ 2021-08-13 19:22 ` Gabriel Ravier
2021-08-14 14:22 ` Stefan Kanthak
2021-08-14 17:10 ` 3rd deficiency (was: Superfluous branches due to insufficient flow analysis) Stefan Kanthak
0 siblings, 2 replies; 4+ messages in thread
From: Gabriel Ravier @ 2021-08-13 19:22 UTC (permalink / raw)
To: Stefan Kanthak, GCC Development
On 8/13/21 8:58 PM, Stefan Kanthak wrote:
> Hi,
>
> compile the following naive implementation of nextafter() for AMD64:
>
> JFTR: ignore the aliasing casts, they don't matter here!
>
> $ cat repro.c
> double nextafter(double from, double to)
> {
> if (to != to)
> return to; // to is NAN
>
> if (from != from)
> return from; // from is NAN
>
> if (from == to) // neither from nor to can be NAN here!
> return to;
>
> if (from == 0.0) // dito!
> return to < 0.0 ? -0x1.0p-1074 : 0x1.0p-1074;
>
> unsigned long long ull = *(unsigned long long *) &from;
>
> if ((from < to) == (from < 0.0))
> ull--;
> else
> ull++;
>
> return *(double *) &ull;
> }
> $ gcc -m64 -o- -O3 -S repro.c
> ...
> nextafter:
> ucomisd %xmm1, %xmm1 // sets PF for unordered result, i.e. when at
> jp .L10 // least one operand is NAN
> ucomisd %xmm0, %xmm0 // same here
> jp .L1
> ucomisd %xmm0, %xmm1
> jnp .L14 // OUCH: PF can't be set here!
> // OUCH: and if it were, it's MORE LIKELY to be
> // clear, so this branch would be taken
> // ... against the branch prediction
> .L11:
> pxor %xmm2, %xmm2 // OUCH: switching from FP SSE to integer SSE and
> // vice versa incurs a penalty of 1 cycle
> // on quite a lot Intel Core processors!
> // Better use XORPD instead (which is even 1 byte
> // shorter)!
> ucomisd %xmm2, %xmm0
> jnp .L15 // OUCH: there's still no need to check PF here!
> .L4:
> comisd %xmm0, %xmm1
> movq %xmm0, %rdx
> leaq -1(%rdx), %rax
> seta %r8b
> comisd %xmm0, %xmm2
> seta %cl
> addq $1, %rdx
> cmpb %cl, %r8b
> cmovne %rdx, %rax
> movq %rax, %xmm0
> .L1:
> ret
> .L14:
> jne .L11
> .L10:
> movapd %xmm1, %xmm0
> ret
> .L15:
> jne .L4
> movabsq $-9223372036854775808, %rdx
> movq %xmm1, %rax
> andq %rdx, %rax
> orq $1, %rax
> movq %rax, %xmm0
> ret
>
>
> Stefan
Shouldn't this kind of stuff go to the Bugzilla ?
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: Superfluous branches due to insufficient flow analysis
2021-08-13 19:22 ` Gabriel Ravier
@ 2021-08-14 14:22 ` Stefan Kanthak
2021-08-14 17:10 ` 3rd deficiency (was: Superfluous branches due to insufficient flow analysis) Stefan Kanthak
1 sibling, 0 replies; 4+ messages in thread
From: Stefan Kanthak @ 2021-08-14 14:22 UTC (permalink / raw)
To: GCC Development, Gabriel Ravier
"Gabriel Ravier" <gabravier@gmail.com> wrote:
Please don't FULL QUOTE!
> On 8/13/21 8:58 PM, Stefan Kanthak wrote:
>> Hi,
>>
>> compile the following naive implementation of nextafter() for AMD64:
>>
>> JFTR: ignore the aliasing casts, they don't matter here!
>>
>> $ cat repro.c
[...]
> Shouldn't this kind of stuff go to the Bugzilla ?
I don't mind if you or someone else take it there.
If you do so, here's one for the road:
$ cat alternate.c
double nextafter(double from, double to)
{
if (from == to)
return to;
if ((from != from) || (to != to))
return from + to;
if (from == 0.0)
return to < 0.0 ? -0x1.0p-1074 : 0x1.0p-1074;
unsigned long long ull = *(unsigned long long *) &from;
if ((from > to) == (from > 0.0))
ull--;
else
ull++;
return *(double *) &ull;
}
$ gcc -m64 -o- -O3 -S alternate.c
nextafter:
ucomisd %xmm0, %xmm1 # sets PF if comparands are unordered,
# i.e. at least one is a NaN
jnp .L20 # OUCH: this branch is unlikely taken!
.L13: # reached only if at least one
# argument is a NaN!
ucomisd %xmm0, %xmm0 # OUCH: SUPERFLUOUS!
jnp .L21 # OUCH: SUPERFLUOUS!
.L14:
addsd %xmm1, %xmm0
ret
.L21: # reached only if at least one
# argument is a NaN!
ucomisd %xmm1, %xmm1 # OUCH: SUPERFLUOUS!
jp .L14 # OUCH: SUPERFLUOUS!
pxor %xmm2, %xmm2
ucomisd %xmm2, %xmm0
jnp .L22 # OUCH: SUPERFLUOUS!
# xmm0 can't be NaN here!
# OUCH: and if it were, this branch
# would unlikely be taken!
.L8:
comisd %xmm0, %xmm1
movq %xmm0, %rdx
leaq -1(%rdx), %rax
seta %r8b
comisd %xmm0, %xmm2
seta %cl
addq $1, %rdx
cmpb %cl, %r8b
cmovne %rdx, %rax
movq %rax, %xmm0
ret
.L20:
jne .L13
movapd %xmm1, %xmm0
ret
.L22:
jne .L8
movabsq $-9223372036854775808, %rdx
movq %xmm1, %rax
andq %rdx, %rax
orq $1, %rax
movq %rax, %xmm0
ret
Stefan
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: 3rd deficiency (was: Superfluous branches due to insufficient flow analysis)
2021-08-13 19:22 ` Gabriel Ravier
2021-08-14 14:22 ` Stefan Kanthak
@ 2021-08-14 17:10 ` Stefan Kanthak
1 sibling, 0 replies; 4+ messages in thread
From: Stefan Kanthak @ 2021-08-14 17:10 UTC (permalink / raw)
To: GCC Development, Gabriel Ravier
Gabriel Ravier <gabravier@gmail.com> wrote:
Independent from the defunct flow analysis in the presence of NaNs, my
example demonstrates another minor deficiency: know thy instruction set!
See the comments in the assembly below.
> On 8/13/21 8:58 PM, Stefan Kanthak wrote:
>> Hi,
>>
>> compile the following naive implementation of nextafter() for AMD64:
>>
>> JFTR: ignore the aliasing casts, they don't matter here!
>>
>> $ cat repro.c
>> double nextafter(double from, double to)
>> {
>> if (to != to)
>> return to; // to is NAN
>>
>> if (from != from)
>> return from; // from is NAN
>>
>> if (from == to) // neither from nor to can be NAN here!
>> return to;
>>
>> if (from == 0.0) // dito!
>> return to < 0.0 ? -0x1.0p-1074 : 0x1.0p-1074;
>>
>> unsigned long long ull = *(unsigned long long *) &from;
>>
>> if ((from < to) == (from < 0.0))
>> ull--;
>> else
>> ull++;
>>
>> return *(double *) &ull;
>> }
>> $ gcc -m64 -o- -O3 -S repro.c
>> ...
>> nextafter:
[...]
>> .L4:
>> comisd %xmm0, %xmm1 # sets CF if first comparand < second comparand
>> movq %xmm0, %rdx
>> leaq -1(%rdx), %rax # superfluous
>> seta %r8b # sbb %rcx, %rcx
>> comisd %xmm0, %xmm2
>> seta %cl # sbb %rax, %rax
>> addq $1, %rdx # xor %rcx, %rax
>> cmpb %cl, %r8b # or $1, %rax
>> cmovne %rdx, %rax # add %rdx, %rax
>> movq %rax, %xmm0
Stefan
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2021-08-14 17:12 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-08-13 18:58 Superfluous branches due to insufficient flow analysis Stefan Kanthak
2021-08-13 19:22 ` Gabriel Ravier
2021-08-14 14:22 ` Stefan Kanthak
2021-08-14 17:10 ` 3rd deficiency (was: Superfluous branches due to insufficient flow analysis) Stefan Kanthak
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).