From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from eggs.gnu.org (eggs.gnu.org [IPv6:2001:470:142:3::10]) by sourceware.org (Postfix) with ESMTPS id A82BF3858D37 for ; Mon, 9 Jan 2023 11:38:29 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.2 sourceware.org A82BF3858D37 Authentication-Results: sourceware.org; dmarc=none (p=none dis=none) header.from=nexgo.de Authentication-Results: sourceware.org; spf=fail smtp.mailfrom=nexgo.de Received: from mr3.vodafonemail.de ([145.253.228.163]) by eggs.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256) (Exim 4.90_1) (envelope-from ) id 1pEqTx-0007ui-J5 for gcc@gnu.org; Mon, 09 Jan 2023 06:38:28 -0500 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=nexgo.de; s=vfde-smtpout-mb-15sep; t=1673264291; bh=4Eq/Cswq8N0qlyzDmAU0c09vG5rLIgfTAec2W2tVkNw=; h=Message-ID:From:To:Subject:Date:Content-Type:X-Mailer:From; b=BYyFht8Pv63cRsXbS4rEpR5ASz3PCfKwnCV4wo4N9HEQIW9pNQUIhNWckOFiqF/pe CZVMJxfI+CIYnFJmWmCa7Pfw2G0ED7Pn/xUPWNrIbLofJQ8rb5OHfZICaFRwFHvire kHzV03IPKFz7211C2DA11FJkwfVxwVBs4fUbE85E= Received: from smtp.vodafone.de (unknown [10.0.0.2]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits)) (No client certificate requested) by mr3.vodafonemail.de (Postfix) with ESMTPS id 4NrBlR4g9jz215d for ; Mon, 9 Jan 2023 11:38:10 +0000 (UTC) Received: from H270 (p5de6d091.dip0.t-ipconnect.de [93.230.208.145]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-SHA384 (256/256 bits)) (No client certificate requested) by smtp.vodafone.de (Postfix) with ESMTPSA id 4NrBlL1QWjz9sL7 for ; Mon, 9 Jan 2023 11:38:00 +0000 (UTC) Message-ID: <1E8B7BDD934E435E95A9FF0C64709165@H270> From: "Stefan Kanthak" To: Subject: B^HDEAD code generation (i386) Date: Mon, 9 Jan 2023 12:31:55 +0100 Organization: Me, myself & IT MIME-Version: 1.0 Content-Type: text/plain; charset="iso-8859-1" Content-Transfer-Encoding: 7bit X-Priority: 3 X-MSMail-Priority: Normal X-Mailer: Microsoft Windows Mail 6.0.6002.18197 X-MimeOLE: Produced By Microsoft MimeOLE V6.1.7601.24158 X-purgate-type: clean X-purgate: clean X-purgate-size: 9108 X-purgate-ID: 155817::1673264286-3B7FD404-7C182EDC/0/0 Received-SPF: pass client-ip=145.253.228.163; envelope-from=stefan.kanthak@nexgo.de; helo=mr3.vodafonemail.de X-Spam_score_int: -27 X-Spam_score: -2.8 X-Spam_bar: -- X-Spam_report: (-2.8 / 5.0 requ) BAYES_00=-1.9,DKIM_SIGNED=0.1,DKIM_VALID=-0.1,DKIM_VALID_AU=-0.1,DKIM_VALID_EF=-0.1,RCVD_IN_DNSWL_LOW=-0.7,SPF_HELO_NONE=0.001,SPF_PASS=-0.001 autolearn=ham autolearn_force=no X-Spam_action: no action X-Spam-Status: No, score=1.3 required=5.0 tests=BAYES_00,DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,DKIM_VALID_EF,SCC_10_SHORT_WORD_LINES,SCC_20_SHORT_WORD_LINES,SCC_35_SHORT_WORD_LINES,SCC_5_SHORT_WORD_LINES,SPF_FAIL,SPF_HELO_PASS,TXREP autolearn=no autolearn_force=no version=3.4.6 X-Spam-Level: * X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on server2.sourceware.org List-Id: Hi, compile the following 64-bit GCD routine for the i386 processor, with full optimization and the preprocessor macro CTZ defined: --- gcddi3.c --- // Stein's algorithm: greatest common divisor unsigned long long __gcddi3(unsigned long long p, unsigned long long q) { unsigned r, s = 0; unsigned long long t; if (p == 0) return q; if (q == 0) return p; if (p == q) return p; #ifndef CTZ while (((p | q) & 1) == 0) p >>= 1, q >>= 1, s++; while ((p & 1) == 0) p >>= 1; do { while ((q & 1) == 0) q >>= 1; #elif CTZ != 32 s = __builtin_ctzll(p | q), p >>= s, q >>= s; r = __builtin_ctzll(p), p >>= r; do { r = __builtin_ctzll(q), q >>= r; #else if (((unsigned long) p | (unsigned long) q) == 0) p >>= 32, q >>= 32, s = 32; r = __builtin_ctzl((unsigned long) p | (unsigned long) q), p >>= r, q >>= r, s += r; if ((unsigned long) p == 0) p >>= 32; r = __builtin_ctzl(p), p >>= r; do { if ((unsigned long) q == 0) q >>= 32; r = __builtin_ctzl(q), q >>= r; #endif if (p < q) t = q, q = p, p = t; } while (q -= p); return p << s; } --- EOF --- GCC 12.2: gcc -DCTZ -m32 -mno-sse -O3 gcddi3.c # https://godbolt.org/z/c7cao8M57 __gcddi3(unsigned long long, unsigned long long): push ebp push edi push esi push ebx sub esp, 28 mov ebx, DWORD PTR [esp+52] mov ecx, DWORD PTR [esp+48] mov esi, DWORD PTR [esp+56] mov edi, DWORD PTR [esp+60] mov ebp, ebx mov DWORD PTR [esp+8], ecx or ebp, ecx mov DWORD PTR [esp+12], ebx mov eax, esi mov edx, edi je .L1 mov eax, ecx mov edx, ebx xor eax, esi xor edx, edi or eax, edx je .L6 mov eax, esi or eax, edi je .L6 mov eax, ecx mov edx, ebx sub esp, 8 xor ebp, ebp or edx, edi or eax, esi push edx # push eax # call __ctzdi2 # Oops: shouldn't this be inlined with -O3?! pop edx # pop ecx # mov ebx, eax mov edx, DWORD PTR [esp+20] mov eax, DWORD PTR [esp+16] mov ecx, ebx shrd eax, edx, cl shr edx, cl test bl, 32 cmovne eax, edx cmovne edx, ebp shrd esi, edi, cl xor ebp, ebp shr edi, cl test bl, 32 mov DWORD PTR [esp+16], eax cmovne esi, edi cmovne edi, ebp mov DWORD PTR [esp+20], edx push edx # push eax # call __ctzdi2 # Oops: shouldn't this be inlined with -O3?! mov edx, DWORD PTR [esp+28] add esp, 16 mov ebp, eax mov eax, DWORD PTR [esp+8] mov ecx, ebp xor ebp, ebp shrd eax, edx, cl shr edx, cl and ecx, 32 cmovne eax, edx cmovne edx, ebp mov DWORD PTR [esp+8], eax mov DWORD PTR [esp+12], edx .L4: sub esp, 8 xor ebp, ebp push edi # push esi # call __ctzdi2 # Oops: shouldn't this be inlined with -O3?! add esp, 16 mov edx, DWORD PTR [esp+12] mov ecx, eax shrd esi, edi, cl shr edi, cl test al, 32 mov eax, DWORD PTR [esp+8] cmovne esi, edi cmovne edi, ebp mov ecx, edx cmp eax, esi sbb ecx, edi jnc .L3 mov DWORD PTR [esp+8], esi mov esi, eax mov DWORD PTR [esp+12], edi mov edi, edx .L3: sub esi, DWORD PTR [esp+8] sbb edi, DWORD PTR [esp+12] mov eax, edi or eax, esi jne .L4 mov eax, DWORD PTR [esp+8] mov edx, DWORD PTR [esp+12] mov ecx, ebx xor ebx, ebx shld edx, eax, cl sal eax, cl and ecx, 32 cmovne edx, eax cmovne eax, ebx .L1: add esp, 28 pop ebx pop esi pop edi pop ebp ret .L6: mov eax, DWORD PTR [esp+8] mov edx, DWORD PTR [esp+12] add esp, 28 pop ebx pop esi pop edi pop ebp ret Compile it again, now with the preprocessor macro CTZ=32 defined to avoid/inline the calls of __ctzdi2: GCC 12.2: gcc -DCTZ=32 -m32 -mno-sse -O3 gcddi3.c # https://godbolt.org/z/deo65387b __gcddi3(unsigned long long, unsigned long long): push ebp push edi push esi push ebx mov edi, DWORD PTR [esp+24] mov esi, DWORD PTR [esp+20] mov eax, DWORD PTR [esp+28] mov edx, DWORD PTR [esp+32] mov ebp, edi or ebp, esi mov ecx, eax mov ebx, edx je .L1 mov ecx, esi mov ebx, edi xor ecx, eax xor ebx, edx or ecx, ebx je .L9 mov ebx, eax or ebx, edx je .L9 mov ecx, esi xor ebx, ebx or ecx, eax jne .L3 mov esi, edi mov eax, edx xor edi, edi xor edx, edx mov ecx, esi mov ebx, 32 or ecx, eax .L3: rep bsf ecx, ecx xor ebp, ebp # OUCH: BSF and TZCNT return at most 31, shrd esi, edi, cl shr edi, cl test cl, 32 # so this is dead code! cmovne esi, edi # cmovne edi, ebp # shrd eax, edx, cl xor ebp, ebp # OUCH: EBP is already 0 here, shr edx, cl test cl, 32 # and BSF and TZCNT return at most 31, cmovne eax, edx # so this is dead code! cmovne edx, ebp # lea ebp, [ebx+ecx] mov ecx, esi # Oops: superfluous test esi, esi jne .L4 mov esi, edi xor edi, edi mov ecx, esi # Oops: superfluous .L4: rep bsf ecx, ecx # Oops: should be BSF ECX, ESI xor ebx, ebx # OUCH: BSF and TZCNT return at most 31, shrd esi, edi, cl shr edi, cl and ecx, 32 # and there's no need to modify ECX here, cmovne esi, edi # so this is dead code! cmovne edi, ebx # .L7: mov ecx, eax # Oops: superfluous test eax, eax jne .L5 mov eax, edx xor edx, edx mov ecx, eax # Oops: superfluous .L5: rep bsf ecx, ecx # Oops: should be BSF ECX, EAX xor ebx, ebx # OUCH: BSF and TZCNT return at most 31, shrd eax, edx, cl shr edx, cl test cl, 32 # so this is dead code! cmovne eax, edx # cmovne edx, ebx # mov ebx, edi cmp esi, eax sbb ebx, edx jnc .L6 mov ecx, esi mov ebx, edi mov esi, eax mov edi, edx mov eax, ecx mov edx, ebx .L6: sub eax, esi sbb edx, edi mov ebx, edx or ebx, eax jne .L7 mov ecx, ebp xor eax, eax shld edi, esi, cl sal esi, cl test cl, 32 cmovne edi, esi cmovne esi, eax mov ebx, edi mov ecx, esi .L1: mov edx, ebx mov eax, ecx pop ebx pop esi pop edi pop ebp ret .L9: mov ebx, edi # Ouch: GCC likes to play shell games! mov ecx, esi # mov edx, ebx # mov eax, ecx # pop ebx pop esi pop edi pop ebp ret 22 superfluous instructions out of the total 104 instructions NOT AMUSED Stefan Kanthak PS: shows properly written assembly code.