public inbox for gcc@gcc.gnu.org
 help / color / mirror / Atom feed
* Superfluous branches due to insufficient flow analysis
@ 2021-08-13 18:58 Stefan Kanthak
  2021-08-13 19:22 ` Gabriel Ravier
  0 siblings, 1 reply; 4+ messages in thread
From: Stefan Kanthak @ 2021-08-13 18:58 UTC (permalink / raw)
  To: GCC Development

Hi,

compile the following naive implementation of nextafter() for AMD64:

JFTR: ignore the aliasing casts, they don't matter here!

$ cat repro.c
double nextafter(double from, double to)
{
    if (to != to)
        return to;        // to is NAN

    if (from != from)
        return from;      // from is NAN

    if (from == to)       // neither from nor to can be NAN here!
        return to;

    if (from == 0.0)      // dito!
        return to < 0.0 ? -0x1.0p-1074 : 0x1.0p-1074;

    unsigned long long ull = *(unsigned long long *) &from;

    if ((from < to) == (from < 0.0))
        ull--;
    else
        ull++;

    return *(double *) &ull;
}
$ gcc -m64 -o- -O3 -S repro.c
...
nextafter:
        ucomisd %xmm1, %xmm1    // sets PF for unordered result, i.e. when at
        jp      .L10            //  least one operand is NAN
        ucomisd %xmm0, %xmm0    // same here
        jp      .L1
        ucomisd %xmm0, %xmm1
        jnp     .L14            // OUCH: PF can't be set here!
                                // OUCH: and if it were, it's MORE LIKELY to be
                                //        clear, so this branch would be taken
                                //         ... against the branch prediction 
.L11:
        pxor    %xmm2, %xmm2    // OUCH: switching from FP SSE to integer SSE and
                                //        vice versa incurs a penalty of 1 cycle
                                //         on quite a lot Intel Core processors!
                                // Better use XORPD instead (which is even 1 byte
                                //  shorter)!
        ucomisd %xmm2, %xmm0
        jnp     .L15            // OUCH: there's still no need to check PF here! 
.L4:
        comisd  %xmm0, %xmm1
        movq    %xmm0, %rdx
        leaq    -1(%rdx), %rax
        seta    %r8b
        comisd  %xmm0, %xmm2
        seta    %cl
        addq    $1, %rdx
        cmpb    %cl, %r8b
        cmovne  %rdx, %rax
        movq    %rax, %xmm0
.L1:
        ret
.L14:
        jne     .L11
.L10:
        movapd  %xmm1, %xmm0
        ret
.L15:
        jne     .L4
        movabsq $-9223372036854775808, %rdx
        movq    %xmm1, %rax
        andq    %rdx, %rax
        orq     $1, %rax
        movq    %rax, %xmm0
        ret


Stefan

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: Superfluous branches due to insufficient flow analysis
  2021-08-13 18:58 Superfluous branches due to insufficient flow analysis Stefan Kanthak
@ 2021-08-13 19:22 ` Gabriel Ravier
  2021-08-14 14:22   ` Stefan Kanthak
  2021-08-14 17:10   ` 3rd deficiency (was: Superfluous branches due to insufficient flow analysis) Stefan Kanthak
  0 siblings, 2 replies; 4+ messages in thread
From: Gabriel Ravier @ 2021-08-13 19:22 UTC (permalink / raw)
  To: Stefan Kanthak, GCC Development

On 8/13/21 8:58 PM, Stefan Kanthak wrote:
> Hi,
>
> compile the following naive implementation of nextafter() for AMD64:
>
> JFTR: ignore the aliasing casts, they don't matter here!
>
> $ cat repro.c
> double nextafter(double from, double to)
> {
>      if (to != to)
>          return to;        // to is NAN
>
>      if (from != from)
>          return from;      // from is NAN
>
>      if (from == to)       // neither from nor to can be NAN here!
>          return to;
>
>      if (from == 0.0)      // dito!
>          return to < 0.0 ? -0x1.0p-1074 : 0x1.0p-1074;
>
>      unsigned long long ull = *(unsigned long long *) &from;
>
>      if ((from < to) == (from < 0.0))
>          ull--;
>      else
>          ull++;
>
>      return *(double *) &ull;
> }
> $ gcc -m64 -o- -O3 -S repro.c
> ...
> nextafter:
>          ucomisd %xmm1, %xmm1    // sets PF for unordered result, i.e. when at
>          jp      .L10            //  least one operand is NAN
>          ucomisd %xmm0, %xmm0    // same here
>          jp      .L1
>          ucomisd %xmm0, %xmm1
>          jnp     .L14            // OUCH: PF can't be set here!
>                                  // OUCH: and if it were, it's MORE LIKELY to be
>                                  //        clear, so this branch would be taken
>                                  //         ... against the branch prediction
> .L11:
>          pxor    %xmm2, %xmm2    // OUCH: switching from FP SSE to integer SSE and
>                                  //        vice versa incurs a penalty of 1 cycle
>                                  //         on quite a lot Intel Core processors!
>                                  // Better use XORPD instead (which is even 1 byte
>                                  //  shorter)!
>          ucomisd %xmm2, %xmm0
>          jnp     .L15            // OUCH: there's still no need to check PF here!
> .L4:
>          comisd  %xmm0, %xmm1
>          movq    %xmm0, %rdx
>          leaq    -1(%rdx), %rax
>          seta    %r8b
>          comisd  %xmm0, %xmm2
>          seta    %cl
>          addq    $1, %rdx
>          cmpb    %cl, %r8b
>          cmovne  %rdx, %rax
>          movq    %rax, %xmm0
> .L1:
>          ret
> .L14:
>          jne     .L11
> .L10:
>          movapd  %xmm1, %xmm0
>          ret
> .L15:
>          jne     .L4
>          movabsq $-9223372036854775808, %rdx
>          movq    %xmm1, %rax
>          andq    %rdx, %rax
>          orq     $1, %rax
>          movq    %rax, %xmm0
>          ret
>
>
> Stefan
Shouldn't this kind of stuff go to the Bugzilla ?

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: Superfluous branches due to insufficient flow analysis
  2021-08-13 19:22 ` Gabriel Ravier
@ 2021-08-14 14:22   ` Stefan Kanthak
  2021-08-14 17:10   ` 3rd deficiency (was: Superfluous branches due to insufficient flow analysis) Stefan Kanthak
  1 sibling, 0 replies; 4+ messages in thread
From: Stefan Kanthak @ 2021-08-14 14:22 UTC (permalink / raw)
  To: GCC Development, Gabriel Ravier

"Gabriel Ravier" <gabravier@gmail.com> wrote:

Please don't FULL QUOTE!

> On 8/13/21 8:58 PM, Stefan Kanthak wrote:
>> Hi,
>>
>> compile the following naive implementation of nextafter() for AMD64:
>>
>> JFTR: ignore the aliasing casts, they don't matter here!
>>
>> $ cat repro.c

[...]

> Shouldn't this kind of stuff go to the Bugzilla ?

I don't mind if you or someone else take it there.
If you do so, here's one for the road:

$ cat alternate.c
double nextafter(double from, double to)
{
    if (from == to)
        return to;

    if ((from != from) || (to != to))
        return from + to;

    if (from == 0.0)
        return to < 0.0 ? -0x1.0p-1074 : 0x1.0p-1074;

    unsigned long long ull = *(unsigned long long *) &from;

    if ((from > to) == (from > 0.0))
        ull--;
    else
        ull++;

    return *(double *) &ull;
}
$ gcc -m64 -o- -O3 -S alternate.c
nextafter:
        ucomisd %xmm0, %xmm1 # sets PF if comparands are unordered,
                             #  i.e. at least one is a NaN
        jnp     .L20         # OUCH: this branch is unlikely taken!
.L13:                        # reached only if at least one
                             #  argument is a NaN!
        ucomisd %xmm0, %xmm0 # OUCH: SUPERFLUOUS!
        jnp     .L21         # OUCH: SUPERFLUOUS!
.L14:
        addsd   %xmm1, %xmm0
        ret
.L21:                        # reached only if at least one
                             #  argument is a NaN!
        ucomisd %xmm1, %xmm1 # OUCH: SUPERFLUOUS!
        jp      .L14         # OUCH: SUPERFLUOUS!
        pxor    %xmm2, %xmm2
        ucomisd %xmm2, %xmm0
        jnp     .L22         # OUCH: SUPERFLUOUS!
                             #       xmm0 can't be NaN here!
                             # OUCH: and if it were, this branch
                             #        would unlikely be taken!
.L8:
        comisd  %xmm0, %xmm1
        movq    %xmm0, %rdx
        leaq    -1(%rdx), %rax
        seta    %r8b
        comisd  %xmm0, %xmm2
        seta    %cl
        addq    $1, %rdx
        cmpb    %cl, %r8b
        cmovne  %rdx, %rax
        movq    %rax, %xmm0
        ret
.L20:
        jne     .L13
        movapd  %xmm1, %xmm0
        ret
.L22:
        jne     .L8
        movabsq $-9223372036854775808, %rdx
        movq    %xmm1, %rax
        andq    %rdx, %rax
        orq     $1, %rax
        movq    %rax, %xmm0
        ret

Stefan

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: 3rd deficiency (was: Superfluous branches due to insufficient flow analysis)
  2021-08-13 19:22 ` Gabriel Ravier
  2021-08-14 14:22   ` Stefan Kanthak
@ 2021-08-14 17:10   ` Stefan Kanthak
  1 sibling, 0 replies; 4+ messages in thread
From: Stefan Kanthak @ 2021-08-14 17:10 UTC (permalink / raw)
  To: GCC Development, Gabriel Ravier

Gabriel Ravier <gabravier@gmail.com> wrote:

Independent from the defunct flow analysis in the presence of NaNs, my
example demonstrates another minor deficiency: know thy instruction set!
See the comments in the assembly below.

> On 8/13/21 8:58 PM, Stefan Kanthak wrote:
>> Hi,
>>
>> compile the following naive implementation of nextafter() for AMD64:
>>
>> JFTR: ignore the aliasing casts, they don't matter here!
>>
>> $ cat repro.c
>> double nextafter(double from, double to)
>> {
>>      if (to != to)
>>          return to;        // to is NAN
>>
>>      if (from != from)
>>          return from;      // from is NAN
>>
>>      if (from == to)       // neither from nor to can be NAN here!
>>          return to;
>>
>>      if (from == 0.0)      // dito!
>>          return to < 0.0 ? -0x1.0p-1074 : 0x1.0p-1074;
>>
>>      unsigned long long ull = *(unsigned long long *) &from;
>>
>>      if ((from < to) == (from < 0.0))
>>          ull--;
>>      else
>>          ull++;
>>
>>      return *(double *) &ull;
>> }
>> $ gcc -m64 -o- -O3 -S repro.c
>> ...
>> nextafter:
[...]
>> .L4:
>>          comisd  %xmm0, %xmm1    # sets CF if first comparand < second comparand
>>          movq    %xmm0, %rdx
>>          leaq    -1(%rdx), %rax  # superfluous
>>          seta    %r8b            #      sbb %rcx, %rcx
>>          comisd  %xmm0, %xmm2
>>          seta    %cl             #      sbb %rax, %rax
>>          addq    $1, %rdx        #      xor %rcx, %rax
>>          cmpb    %cl, %r8b       #      or  $1, %rax
>>          cmovne  %rdx, %rax      #      add %rdx, %rax
>>          movq    %rax, %xmm0

Stefan

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2021-08-14 17:12 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-08-13 18:58 Superfluous branches due to insufficient flow analysis Stefan Kanthak
2021-08-13 19:22 ` Gabriel Ravier
2021-08-14 14:22   ` Stefan Kanthak
2021-08-14 17:10   ` 3rd deficiency (was: Superfluous branches due to insufficient flow analysis) Stefan Kanthak

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).