From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <SRS0=+Y3u=5G=nexgo.de=stefan.kanthak@sourceware.org>
Received: from eggs.gnu.org (eggs.gnu.org [IPv6:2001:470:142:3::10])
	by sourceware.org (Postfix) with ESMTPS id 1FC743858417
	for <gcc@gcc.gnu.org>; Mon,  9 Jan 2023 12:31:40 +0000 (GMT)
DMARC-Filter: OpenDMARC Filter v1.4.2 sourceware.org 1FC743858417
Authentication-Results: sourceware.org; dmarc=none (p=none dis=none) header.from=nexgo.de
Authentication-Results: sourceware.org; spf=fail smtp.mailfrom=nexgo.de
Received: from mr5.vodafonemail.de ([145.253.228.165])
	by eggs.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256)
	(Exim 4.90_1)
	(envelope-from <stefan.kanthak@nexgo.de>)
	id 1pErJR-0007sO-Kh
	for gcc@gnu.org; Mon, 09 Jan 2023 07:31:39 -0500
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=nexgo.de;
	s=vfde-smtpout-mb-15sep; t=1673267494;
	bh=ZszK40HSq0A7cm5nYd0ashxy8iTqn03hor9bEwqiuuM=;
	h=Message-ID:From:To:Subject:Date:Content-Type:X-Mailer:From;
	b=QiSMEKmxvw67pW4aQbiQqjxs8J6LzlHKQ96agJa1TwaZJ+KuJehrKk/LAb6a4owVS
	 zELPvUwxNZyDbSTKcBVEi6/k+2lkiseEL3LyDTlsBH8KP4b+gFYY5IRuj/jSSNsRtp
	 q8cSX3r8Fsq0BHgqvq2JA4saluz1jR/cZoO8xmAw=
Received: from smtp.vodafone.de (unknown [10.0.0.2])
	(using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits)
	 key-exchange X25519 server-signature RSA-PSS (2048 bits))
	(No client certificate requested)
	by mr5.vodafonemail.de (Postfix) with ESMTPS id 4NrCx21H4fz1yVM
	for <gcc@gnu.org>; Mon,  9 Jan 2023 12:31:33 +0000 (UTC)
Received: from H270 (p5de6d091.dip0.t-ipconnect.de [93.230.208.145])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.vodafone.de (Postfix) with ESMTPSA id 4NrCww5h01zHrFD
	for <gcc@gnu.org>; Mon,  9 Jan 2023 12:31:25 +0000 (UTC)
Message-ID: <554A1354252F43BB8915A74129C41BE3@H270>
From: "Stefan Kanthak" <stefan.kanthak@nexgo.de>
To: <gcc@gnu.org>
Subject: Widening multiplication, but no narrowing division [i386/AMD64]
Date: Mon, 9 Jan 2023 13:20:28 +0100
Organization: Me, myself & IT
MIME-Version: 1.0
Content-Type: text/plain;
	charset="iso-8859-1"
Content-Transfer-Encoding: 8bit
X-Priority: 3
X-MSMail-Priority: Normal
X-Mailer: Microsoft Windows Mail 6.0.6002.18197
X-MimeOLE: Produced By Microsoft MimeOLE V6.1.7601.24158
X-purgate-type: clean
X-purgate: clean
X-purgate-size: 4310
X-purgate-ID: 155817::1673267488-357F94F5-B86004F7/0/0
Received-SPF: pass client-ip=145.253.228.165; envelope-from=stefan.kanthak@nexgo.de; helo=mr5.vodafonemail.de
X-Spam_score_int: -27
X-Spam_score: -2.8
X-Spam_bar: --
X-Spam_report: (-2.8 / 5.0 requ) BAYES_00=-1.9,DKIM_SIGNED=0.1,DKIM_VALID=-0.1,DKIM_VALID_AU=-0.1,DKIM_VALID_EF=-0.1,RCVD_IN_DNSWL_LOW=-0.7,SPF_HELO_NONE=0.001,SPF_PASS=-0.001 autolearn=ham autolearn_force=no
X-Spam_action: no action
X-Spam-Status: No, score=-2.5 required=5.0 tests=BAYES_00,DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,DKIM_VALID_EF,SPF_FAIL,SPF_HELO_PASS,TXREP autolearn=no autolearn_force=no version=3.4.6
X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on server2.sourceware.org
List-Id: <gcc.gcc.gnu.org>

Hi,

GCC (and other C compilers too) support the widening multiplication
of i386/AMD64 processors, but DON'T support their narrowing division:

--- demo.c ---
/* Widening multiplication: the first operand is converted to
   unsigned long long before multiplying, so the product is computed in
   the wider type (no bits are lost where unsigned long long is twice as
   wide as unsigned long, e.g. with -m32). */
unsigned long long product(unsigned long multiplicand,
                           unsigned long multiplier)
{
    unsigned long long const wide = multiplicand;

    return wide * multiplier;
}

/* Narrowing division: divides a (wide) unsigned long long dividend by a
   (narrow) unsigned long divisor.  Returns the quotient and stores the
   remainder, which is always smaller than the divisor and therefore
   fits the narrow type, through *remainder.
   A divisor of 0 is undefined behaviour. */
unsigned long long quotient(unsigned long long dividend,
                            unsigned long divisor,
                            unsigned long *remainder)
{
    unsigned long long const wide_divisor = divisor;
    unsigned long long const result = dividend / wide_divisor;

    *remainder = (unsigned long) (dividend % wide_divisor);
    return result;
}
--- EOF ---

GCC 12.2: gcc -m32 -O2 demo.c

# https://godbolt.org/z/1M9dohMcE
# GCC output for product(): a single widening MUL, no library call.
# Arguments are read from the stack above the return address.
product(unsigned long, unsigned long):
        mov     eax, DWORD PTR [esp+8]  # eax = second argument (multiplier)
        mul     DWORD PTR [esp+4]       # edx:eax = eax * first argument (multiplicand)
        ret                             # 64-bit product is returned in edx:eax
# GCC output for quotient(): zero-extends the 32-bit divisor to 64 bits
# and calls __udivmoddi4, passing the address of a stack temporary for
# the remainder; afterwards the low dword of that temporary is copied
# to *remainder.  No DIV instruction is emitted here.
quotient(unsigned long long, unsigned long, unsigned long*):
        push    ebx                     # save ebx, used below as scratch
        xor     edx, edx                # edx = 0 = high dword of the widened divisor
        sub     esp, 24                 # reserve 24 bytes of stack
        mov     eax, DWORD PTR [esp+40] # eax = divisor (entry [esp+12], moved by 4+24)
        lea     ecx, [esp+8]            # ecx = address of a stack temporary for the remainder
        sub     esp, 12                 # 12 more bytes (presumably call-site alignment padding)
        push    ecx                     # argument: address of the remainder temporary
        push    edx                     # argument: high dword of divisor (0)
        push    eax                     # argument: low dword of divisor
        push    DWORD PTR [esp+60]      # argument: high dword of dividend
        push    DWORD PTR [esp+60]      # argument: low dword of dividend (same offset, esp moved by 4)
        call    __udivmoddi4            # result in edx:eax is left untouched until ret
        mov     ebx, DWORD PTR [esp+40] # ebx = low dword of the remainder temporary (8+12+20)
        mov     ecx, DWORD PTR [esp+76] # ecx = remainder pointer (entry [esp+16], moved by 4+56)
        mov     DWORD PTR [ecx], ebx    # *remainder = ebx
        add     esp, 56                 # release 24 + 12 + 20 (five pushed dwords) bytes
        pop     ebx                     # restore ebx
        ret                             # edx:eax as returned by __udivmoddi4

### Diversion ###

Even worse and completely BRAINDEAD, another compiler calls __udivdi3()
and wastes a multiplication to compute the remainder, ignoring the fact
that __udivdi3() calls __udivmoddi4() which already returns quotient
and remainder:

clang 15.0.0: clang -m32 -O2 demo.c

# https://godbolt.org/z/rv1sTe7xv
# clang output for product(): a single widening MUL, no library call.
# Arguments are read from the stack above the return address.
product(unsigned long, unsigned long):
        mov     eax, dword ptr [esp + 8]        # eax = second argument (multiplier)
        mul     dword ptr [esp + 4]             # edx:eax = eax * first argument (multiplicand)
        ret                                     # 64-bit product is returned in edx:eax
# clang output for quotient(): calls __udivdi3 through the PLT for the
# quotient only, then reconstructs the remainder as
# low(dividend) - divisor * low(quotient) in 32-bit arithmetic.
quotient(unsigned long long, unsigned long, unsigned long*):
        push    ebp                     # save the four registers used below
        push    ebx
        push    edi
        push    esi
        sub     esp, 12                 # 12 bytes (presumably call-site alignment padding)
        call    .L1$pb                  # call/pop pair: fetch the current instruction address
.L1$pb:
        pop     ebx                     # ebx = address of .L1$pb
.Ltmp2:
        add     ebx, offset _GLOBAL_OFFSET_TABLE_+(.Ltmp2-.L1$pb)      # ebx = GOT address, for the @PLT call
        mov     esi, dword ptr [esp + 44]       # esi = remainder pointer (entry [esp+16], moved by 16+12)
        mov     edi, dword ptr [esp + 32]       # edi = low dword of dividend (entry [esp+4])
        mov     ebp, dword ptr [esp + 40]       # ebp = divisor (entry [esp+12])
        push    0                       # argument: high dword of divisor (0)
        push    ebp                     # argument: low dword of divisor
        push    dword ptr [esp + 44]    # argument: high dword of dividend (entry [esp+8], moved by 28+8)
        push    edi                     # argument: low dword of dividend
        call    __udivdi3@PLT           # edx:eax = 64-bit quotient
        add     esp, 16                 # drop the four pushed argument dwords
        imul    ebp, eax                # ebp = divisor * low dword of quotient (low 32 bits)
        sub     edi, ebp                # edi = low dword of dividend - ebp = remainder
        mov     dword ptr [esi], edi    # *remainder = edi
        add     esp, 12                 # release the 12 bytes reserved above
        pop     esi                     # restore saved registers in reverse order
        pop     edi
        pop     ebx
        pop     ebp
        ret                             # edx:eax = quotient from __udivdi3

### end of diversion ###

Both compilers miss the fact that the i386 processor has a narrowing
integer division and can therefore divide 64-bit / 32-bit numbers,
for example with the well-known "long" alias "schoolbook" division,
returning 64-bit quotient and 32-bit remainder:

.arch   generic32
.code32
.intel_syntax noprefix
.text

# unsigned long long quotient(unsigned long long dividend,
#                             unsigned long divisor,
#                             unsigned long *remainder)
#
# 64-bit / 32-bit unsigned "long" alias "schoolbook" division, built
# from (at most) two narrowing DIV instructions: first the high dword
# of the dividend is divided, then its remainder together with the low
# dword of the dividend.
#
# ABI:   i386, all arguments on the stack, caller cleans up
# In:    [esp+4]  = low dword of dividend
#        [esp+8]  = high dword of dividend
#        [esp+12] = divisor
#        [esp+16] = address of remainder
# Out:   edx:eax  = quotient, *remainder = remainder
# Clobb: ecx, flags
# NOTE:  a divisor of 0 raises a divide error (#DE)

.globl  quotient
quotient:
        mov     ecx, [esp+12]   # ecx = divisor
.if 0
        xor     edx, edx
        mov     eax, [esp+8]    # edx:eax = high dword of dividend
        cmp     eax, edx
        je      0f              # high dword of dividend = 0?
.else
        xor     eax, eax
        mov     edx, [esp+8]    # eax:edx = high dword of dividend
        cmp     edx, ecx
        jb      0f              # high dword of dividend < divisor?
                                # quotient < 2**32?
        xchg    eax, edx
.endif
        div     ecx             # eax = high dword of quotient,
0:                              # edx = high dword of dividend'
        push    eax             # save high dword of quotient;
                                # stack arguments are now 4 bytes further away
        mov     eax, [esp+8]    # edx:eax = dividend'
        div     ecx             # eax = low dword of quotient,
                                # edx = remainder
        mov     ecx, [esp+20]   # ecx = address of remainder
                                #     = 4th argument dword, [esp+16] on entry
        mov     [ecx], edx
        pop     edx             # edx:eax = quotient
        ret
.end

JFTR: depending on the magnitude of the numbers and the processor
      it MIGHT be better to omit comparison and branch: there's a
      trade-off between the latency of the (un-pipelined) division
      instruction and the latency of the conditional branch due to
      misprediction.

Stefan Kanthak