B^HDEAD code generation (i386)

public inbox for gcc@gcc.gnu.org
 help / color / mirror / Atom feed

* B^HDEAD code generation (i386)
@ 2023-01-09 11:31 Stefan Kanthak
  0 siblings, 0 replies; only message in thread
From: Stefan Kanthak @ 2023-01-09 11:31 UTC (permalink / raw)
  To: gcc

Hi,

compile the following 64-bit GCD routine for the i386 processor,
with full optimization and the preprocessor macro CTZ defined:

--- gcddi3.c ---
// Stein's algorithm: greatest common divisor

unsigned long long __gcddi3(unsigned long long p, unsigned long long q)
{
    unsigned           r, s = 0;
    unsigned long long t;

    if (p == 0)
        return q;

    if (q == 0)
        return p;

    if (p == q)
        return p;
#ifndef CTZ
    while (((p | q) & 1) == 0)
        p >>= 1, q >>= 1, s++;

    while ((p & 1) == 0)
        p >>= 1;

    do {
        while ((q & 1) == 0)
            q >>= 1;
#elif CTZ != 32
    s = __builtin_ctzll(p | q), p >>= s, q >>= s;
    r = __builtin_ctzll(p), p >>= r;

    do {
        r = __builtin_ctzll(q), q >>= r;
#else
    if (((unsigned long) p | (unsigned long) q) == 0)
        p >>= 32, q >>= 32, s = 32;

    r = __builtin_ctzl((unsigned long) p | (unsigned long) q), p >>= r, q >>= r, s += r;

    if ((unsigned long) p == 0)
        p >>= 32;

    r = __builtin_ctzl(p), p >>= r;

    do
    {
        if ((unsigned long) q == 0)
            q >>= 32;

        r = __builtin_ctzl(q), q >>= r;
#endif
        if (p < q)
            t = q, q = p, p = t;
    } while (q -= p);

    return p << s;
}
--- EOF ---

GCC 12.2: gcc -DCTZ -m32 -mno-sse -O3 gcddi3.c

# https://godbolt.org/z/c7cao8M57
__gcddi3(unsigned long long, unsigned long long):
        push    ebp
        push    edi
        push    esi
        push    ebx
        sub     esp, 28
        mov     ebx, DWORD PTR [esp+52]
        mov     ecx, DWORD PTR [esp+48]
        mov     esi, DWORD PTR [esp+56]
        mov     edi, DWORD PTR [esp+60]
        mov     ebp, ebx
        mov     DWORD PTR [esp+8], ecx
        or      ebp, ecx
        mov     DWORD PTR [esp+12], ebx
        mov     eax, esi
        mov     edx, edi
        je      .L1
        mov     eax, ecx
        mov     edx, ebx
        xor     eax, esi
        xor     edx, edi
        or      eax, edx
        je      .L6
        mov     eax, esi
        or      eax, edi
        je      .L6
        mov     eax, ecx
        mov     edx, ebx
        sub     esp, 8
        xor     ebp, ebp
        or      edx, edi
        or      eax, esi
        push    edx        #
        push    eax        #
        call    __ctzdi2   # Oops: shouldn't this be inlined with -O3?!
        pop     edx        #
        pop     ecx        #
        mov     ebx, eax
        mov     edx, DWORD PTR [esp+20]
        mov     eax, DWORD PTR [esp+16]
        mov     ecx, ebx
        shrd    eax, edx, cl
        shr     edx, cl
        test    bl, 32
        cmovne  eax, edx
        cmovne  edx, ebp
        shrd    esi, edi, cl
        xor     ebp, ebp
        shr     edi, cl
        test    bl, 32
        mov     DWORD PTR [esp+16], eax
        cmovne  esi, edi
        cmovne  edi, ebp
        mov     DWORD PTR [esp+20], edx
        push    edx        #
        push    eax        #
        call    __ctzdi2   # Oops: shouldn't this be inlined with -O3?!
        mov     edx, DWORD PTR [esp+28]
        add     esp, 16
        mov     ebp, eax
        mov     eax, DWORD PTR [esp+8]
        mov     ecx, ebp
        xor     ebp, ebp
        shrd    eax, edx, cl
        shr     edx, cl
        and     ecx, 32
        cmovne  eax, edx
        cmovne  edx, ebp
        mov     DWORD PTR [esp+8], eax
        mov     DWORD PTR [esp+12], edx
.L4:
        sub     esp, 8
        xor     ebp, ebp
        push    edi        #
        push    esi        #
        call    __ctzdi2   # Oops: shouldn't this be inlined with -O3?!
        add     esp, 16
        mov     edx, DWORD PTR [esp+12]
        mov     ecx, eax
        shrd    esi, edi, cl
        shr     edi, cl
        test    al, 32
        mov     eax, DWORD PTR [esp+8]
        cmovne  esi, edi
        cmovne  edi, ebp
        mov     ecx, edx
        cmp     eax, esi
        sbb     ecx, edi
        jnc     .L3
        mov     DWORD PTR [esp+8], esi
        mov     esi, eax
        mov     DWORD PTR [esp+12], edi
        mov     edi, edx
.L3:
        sub     esi, DWORD PTR [esp+8]
        sbb     edi, DWORD PTR [esp+12]
        mov     eax, edi
        or      eax, esi
        jne     .L4
        mov     eax, DWORD PTR [esp+8]
        mov     edx, DWORD PTR [esp+12]
        mov     ecx, ebx
        xor     ebx, ebx
        shld    edx, eax, cl
        sal     eax, cl
        and     ecx, 32
        cmovne  edx, eax
        cmovne  eax, ebx
.L1:
        add     esp, 28
        pop     ebx
        pop     esi
        pop     edi
        pop     ebp
        ret
.L6:
        mov     eax, DWORD PTR [esp+8]
        mov     edx, DWORD PTR [esp+12]
        add     esp, 28
        pop     ebx
        pop     esi
        pop     edi
        pop     ebp
        ret


Compile it again, now with the preprocessor macro CTZ=32 defined
to avoid/inline the calls of __ctzdi2:

GCC 12.2: gcc -DCTZ=32 -m32 -mno-sse -O3 gcddi3.c

# https://godbolt.org/z/deo65387b
__gcddi3(unsigned long long, unsigned long long):
        push    ebp
        push    edi
        push    esi
        push    ebx
        mov     edi, DWORD PTR [esp+24]
        mov     esi, DWORD PTR [esp+20]
        mov     eax, DWORD PTR [esp+28]
        mov     edx, DWORD PTR [esp+32]
        mov     ebp, edi
        or      ebp, esi
        mov     ecx, eax
        mov     ebx, edx
        je      .L1
        mov     ecx, esi
        mov     ebx, edi
        xor     ecx, eax
        xor     ebx, edx
        or      ecx, ebx
        je      .L9
        mov     ebx, eax
        or      ebx, edx
        je      .L9
        mov     ecx, esi
        xor     ebx, ebx
        or      ecx, eax
        jne     .L3
        mov     esi, edi
        mov     eax, edx
        xor     edi, edi
        xor     edx, edx
        mov     ecx, esi
        mov     ebx, 32
        or      ecx, eax
.L3:
        rep bsf ecx, ecx
        xor     ebp, ebp       # OUCH: BSF and TZCNT return at most 31,
        shrd    esi, edi, cl
        shr     edi, cl
        test    cl, 32         #       so this is dead code!
        cmovne  esi, edi       #
        cmovne  edi, ebp       #
        shrd    eax, edx, cl
        xor     ebp, ebp       # OUCH: EBP is already 0 here,
        shr     edx, cl
        test    cl, 32         #       and BSF and TZCNT return at most 31,
        cmovne  eax, edx       #       so this is dead code!
        cmovne  edx, ebp       #
        lea     ebp, [ebx+ecx]
        mov     ecx, esi       # Oops: superfluous
        test    esi, esi
        jne     .L4
        mov     esi, edi
        xor     edi, edi
        mov     ecx, esi       # Oops: superfluous
.L4:
        rep bsf ecx, ecx       # Oops: should be BSF ECX, ESI
        xor     ebx, ebx       # OUCH: BSF and TZCNT return at most 31,
        shrd    esi, edi, cl
        shr     edi, cl
        and     ecx, 32        #       and there's no need to modify ECX here,
        cmovne  esi, edi       #       so this is dead code!
        cmovne  edi, ebx       #
.L7:
        mov     ecx, eax       # Oops: superfluous
        test    eax, eax
        jne     .L5
        mov     eax, edx
        xor     edx, edx
        mov     ecx, eax       # Oops: superfluous
.L5:
        rep bsf ecx, ecx       # Oops: should be BSF ECX, EAX
        xor     ebx, ebx       # OUCH: BSF and TZCNT return at most 31,
        shrd    eax, edx, cl
        shr     edx, cl
        test    cl, 32         #       so this is dead code!
        cmovne  eax, edx       #
        cmovne  edx, ebx       #
        mov     ebx, edi
        cmp     esi, eax
        sbb     ebx, edx
        jnc     .L6
        mov     ecx, esi
        mov     ebx, edi
        mov     esi, eax
        mov     edi, edx
        mov     eax, ecx
        mov     edx, ebx
.L6:
        sub     eax, esi
        sbb     edx, edi
        mov     ebx, edx
        or      ebx, eax
        jne     .L7
        mov     ecx, ebp
        xor     eax, eax
        shld    edi, esi, cl
        sal     esi, cl
        test    cl, 32
        cmovne  edi, esi
        cmovne  esi, eax
        mov     ebx, edi
        mov     ecx, esi
.L1:
        mov     edx, ebx
        mov     eax, ecx
        pop     ebx
        pop     esi
        pop     edi
        pop     ebp
        ret
.L9:
        mov     ebx, edi      # Ouch: GCC likes to play shell games!
        mov     ecx, esi      #
        mov     edx, ebx      #
        mov     eax, ecx      #
        pop     ebx
        pop     esi
        pop     edi
        pop     ebp
        ret

22 superfluous instructions out of the total 104 instructions

NOT AMUSED
Stefan Kanthak

PS: <https://skanthak.homepage.t-online.de/gcc.html#case27-i386-s>
    shows properly written assembly code.

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2023-01-09 11:38 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-01-09 11:31 B^HDEAD code generation (i386) Stefan Kanthak

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).