From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from eggs.gnu.org (eggs.gnu.org [IPv6:2001:470:142:3::10]) by sourceware.org (Postfix) with ESMTPS id 55D7B3858409 for ; Mon, 9 Jan 2023 11:50:55 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.2 sourceware.org 55D7B3858409 Authentication-Results: sourceware.org; dmarc=none (p=none dis=none) header.from=nexgo.de Authentication-Results: sourceware.org; spf=fail smtp.mailfrom=nexgo.de Received: from mr4.vodafonemail.de ([145.253.228.164]) by eggs.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256) (Exim 4.90_1) (envelope-from ) id 1pEqfz-00050O-B7 for gcc@gnu.org; Mon, 09 Jan 2023 06:50:53 -0500 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=nexgo.de; s=vfde-smtpout-mb-15sep; t=1673265047; bh=1obXF/PWLquEwpExvG7xy/fXTrgONLXbsf9x15WsvaI=; h=Message-ID:From:To:Subject:Date:Content-Type:X-Mailer:From; b=lTpMG7FAitknIASQGNZjA1WL30W43RgbxqlPertRQOj89Nz6JtTrQUotmbEO9Z2Fo xVjGOst9J5Tjqzl0a0BaA+y0zQcJCftixxSbt+C+tRM/APRNLW0GI8Ypr378RaX9vB n3PJN7KQp0MgF7kjxMncmqOGSByyF1w4cusalRK4= Received: from smtp.vodafone.de (unknown [10.0.0.2]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits)) (No client certificate requested) by mr4.vodafonemail.de (Postfix) with ESMTPS id 4NrC1z5kfRz1y5R for ; Mon, 9 Jan 2023 11:50:45 +0000 (UTC) Received: from H270 (p5de6d091.dip0.t-ipconnect.de [93.230.208.145]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-SHA384 (256/256 bits)) (No client certificate requested) by smtp.vodafone.de (Postfix) with ESMTPSA id 4NrC1s0kxSzMkry for ; Mon, 9 Jan 2023 11:50:37 +0000 (UTC) Message-ID: <77F37AA2F7464541AFD2DCAA8C688C0F@H270> From: "Stefan Kanthak" To: Subject: EPIC optimiser failures (i386) Date: Mon, 9 Jan 2023 12:49:55 +0100 Organization: Me, myself & IT MIME-Version: 1.0 Content-Type: text/plain; charset="iso-8859-1" Content-Transfer-Encoding: 8bit X-Priority: 3 X-MSMail-Priority: Normal X-Mailer: Microsoft 
Windows Mail 6.0.6002.18197 X-MimeOLE: Produced By Microsoft MimeOLE V6.1.7601.24158 X-purgate-type: clean X-purgate: clean X-purgate-size: 5187 X-purgate-ID: 155817::1673265041-797F94D1-75664DD6/0/0 Received-SPF: pass client-ip=145.253.228.164; envelope-from=stefan.kanthak@nexgo.de; helo=mr4.vodafonemail.de X-Spam_score_int: -27 X-Spam_score: -2.8 X-Spam_bar: -- X-Spam_report: (-2.8 / 5.0 requ) BAYES_00=-1.9,DKIM_SIGNED=0.1,DKIM_VALID=-0.1,DKIM_VALID_AU=-0.1,DKIM_VALID_EF=-0.1,RCVD_IN_DNSWL_LOW=-0.7,RCVD_IN_MSPIKE_H2=-0.001,SPF_HELO_NONE=0.001,SPF_PASS=-0.001 autolearn=ham autolearn_force=no X-Spam_action: no action X-Spam-Status: No, score=-1.9 required=5.0 tests=BAYES_00,DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,DKIM_VALID_EF,SCC_5_SHORT_WORD_LINES,SPF_FAIL,SPF_HELO_PASS,TXREP autolearn=no autolearn_force=no version=3.4.6 X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on server2.sourceware.org List-Id: Hi, compile the following routine for the i386 processor, with optimisation: --- double.c --- // IEEE-754 binary64 double-precision floating point // binary64 != ±0.0 -> 0 // binary64 == +0.0 -> +1 // binary64 == -0.0 -> -1 int plusminus0(unsigned long long binary64) { if (binary64 != -binary64) // neither +0.0 nor -0.0 return 0; if (binary64 == 0) return 1; return -1; } --- EOF --- GCC 12.2 gcc -m32 -O2 double.c # https://godbolt.org/z/17as1M1xM plusminus0(unsigned long long): push esi push ebx mov ecx, DWORD PTR [esp+12] mov ebx, DWORD PTR [esp+16] mov eax, ecx neg eax mov edx, ebx adc edx, 0 xor eax, ecx neg edx xor edx, ebx or eax, edx jne .L5 or ecx, ebx pop ebx cmp ecx, 1 sbb esi, esi and esi, 2 sub esi, 1 mov eax, esi pop esi ret .L5: xor esi, esi pop ebx mov eax, esi pop esi ret OUCH: these 27 instructions in 56 bytes are as BAD^WHORRIBLE as code could get! 
EVERY optimising^Wcompiler writer should be aware that if (binary64 == -binary64) is just a shorthand for if (binary64 == 0 - binary64) and thus equivalent to if (binary64 + binary64 == 0) which SHOULD lead to the following (optionally branch-free) code: mov ecx, dword ptr [esp+4] mov edx, dword ptr [esp+8] # edx:ecx = binary64 add ecx, ecx adc edx, edx sbb eax, eax # eax = (binary64 < 0) ? -1 : 0 .ifnotdef BRANCHFREE or ecx, edx jz .L0 # binary64 == -binary64? stc # CF = 1 adc eax, eax # eax = (binary64 < 0) ? -1 : 1 .L0: .else stc # CF = 1 adc eax, eax # eax = (binary64 < 0) ? -1 : 1 or ecx, edx # ecx = (binary64 == -binary64) ? 0 : * neg ecx # CF = (binary64 != -binary64) sbb ecx, ecx # ecx = (binary64 != -binary64) ? -1 : 0 not ecx # ecx = (binary64 == -binary64) ? -1 : 0 and eax, ecx .endif ret Either 10 instructions in 22 bytes or 13 instructions in 28 bytes, i.e. less than half the instructions and bytes! Since the lower half of the binary64 only needs to be tested against 0, a TRUE optimising compiler would but come up with the following code: mov eax, dword ptr [esp+8] # upper half of binary64 cdq # edx = (binary64 < 0) ? -1 : 0 stc # CF = 1 adc edx, edx # edx = (binary64 < 0) ? -1 : 1 add eax, eax or eax, dword ptr [esp+4] neg eax # CF = (binary64 != -binary64) sbb eax, eax # eax = (binary64 != -binary64) ? -1 : 0 not eax # eax = (binary64 == -binary64) ? -1 : 0 and eax, edx ret 11 instructions in 23 bytes. --- single.c --- // IEEE-754 binary32 single-precision floating point int plusminus0(unsigned long binary32) { if (binary32 != -binary32) // neither +0.0 nor -0.0 return 0; if (binary32 == 0) return 1; return -1; } --- EOF --- GCC 12.2 gcc -m32 -O2 single.c # https://godbolt.org/z/djT748e81 plusminus0(unsigned int): mov edx, DWORD PTR [esp+4] xor eax, eax mov ecx, edx neg ecx cmp ecx, edx jne .L1 cmp ecx, 1 sbb eax, eax and eax, 2 sub eax, 1 .L1: ret OOPS (11 instructions in 26 bytes)! 
An optimising compiler SHOULD but generate 8 instructions in 16 bytes: xor eax, eax mov ecx, DWORD PTR [esp+4] add ecx, ecx jnz .L1 # binary32 != -binary32? sbb eax, eax # eax = (binary32 < 0) ? -1 : 0 stc # CF = 1 adc eax, eax # eax = (binary32 < 0) ? -1 : 1 .L1: ret A TRUE optimising compiler would but generate the following branch-free code, using 7 or 8 instructions in 19 or 18 bytes: .if 0 mov eax, DWORD PTR [esp+4] neg eax # OF = (binary32 == -0.0), # ZF = (binary32 == +0.0) .else xor eax, eax sub eax, DWORD PTR [esp+4] .endif seto ah setz al sub al, ah # al = ZF - OF .if 0 cbw cwde .else movsx eax, al .endif ret Stefan Kanthak