From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from eggs.gnu.org (eggs.gnu.org [IPv6:2001:470:142:3::10]) by sourceware.org (Postfix) with ESMTPS id 399EB3858D20 for ; Mon, 9 Jan 2023 11:38:33 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.2 sourceware.org 399EB3858D20 Authentication-Results: sourceware.org; dmarc=none (p=none dis=none) header.from=nexgo.de Authentication-Results: sourceware.org; spf=fail smtp.mailfrom=nexgo.de Received: from mr4.vodafonemail.de ([145.253.228.164]) by eggs.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256) (Exim 4.90_1) (envelope-from ) id 1pEqU2-0007vh-OO for gcc@gnu.org; Mon, 09 Jan 2023 06:38:32 -0500 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=nexgo.de; s=vfde-smtpout-mb-15sep; t=1673264298; bh=WtntDq+TBppHccU6mx952ruKuM3CV1YoTOBDD8j4d5o=; h=Message-ID:From:To:Subject:Date:Content-Type:X-Mailer:From; b=ICnaWpAsNHt1ptTreum4Em/ZdtzKSWrSS2UsKizoRwNWWUq+b8XQ9R9dpG6jYSC9l whoZu1L/z0uL5hOWAFPFUPe8XrwB8CudDanli4oaq4rh7Hvb/UPvaTdckuwfrERHPW keMDC2mUy5fNiF9ZNZyFMZyWLGxgw0ZW7q3HCM4E= Received: from smtp.vodafone.de (unknown [10.0.0.2]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits)) (No client certificate requested) by mr4.vodafonemail.de (Postfix) with ESMTPS id 4NrBlZ1mQyz20pv for ; Mon, 9 Jan 2023 11:38:18 +0000 (UTC) Received: from H270 (p5de6d091.dip0.t-ipconnect.de [93.230.208.145]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-SHA384 (256/256 bits)) (No client certificate requested) by smtp.vodafone.de (Postfix) with ESMTPSA id 4NrBlT5gcmz9w12 for ; Mon, 9 Jan 2023 11:38:10 +0000 (UTC) Message-ID: <76C395596DD94CCA94BF821C6100FF26@H270> From: "Stefan Kanthak" To: Subject: B^HDEAD code generation (AMD64) Date: Mon, 9 Jan 2023 12:35:24 +0100 Organization: Me, myself & IT MIME-Version: 1.0 Content-Type: text/plain; charset="iso-8859-1" Content-Transfer-Encoding: 7bit X-Priority: 3 X-MSMail-Priority: Normal X-Mailer: 
Microsoft Windows Mail 6.0.6002.18197 X-MimeOLE: Produced By Microsoft MimeOLE V6.1.7601.24158 X-purgate-type: clean X-purgate: clean X-purgate-size: 4457 X-purgate-ID: 155817::1673264293-3B7FD404-344E98FA/0/0 Received-SPF: pass client-ip=145.253.228.164; envelope-from=stefan.kanthak@nexgo.de; helo=mr4.vodafonemail.de X-Spam_score_int: -27 X-Spam_score: -2.8 X-Spam_bar: -- X-Spam_report: (-2.8 / 5.0 requ) BAYES_00=-1.9,DKIM_SIGNED=0.1,DKIM_VALID=-0.1,DKIM_VALID_AU=-0.1,DKIM_VALID_EF=-0.1,RCVD_IN_DNSWL_LOW=-0.7,RCVD_IN_MSPIKE_H2=-0.001,SPF_HELO_NONE=0.001,SPF_PASS=-0.001 autolearn=ham autolearn_force=no X-Spam_action: no action X-Spam-Status: No, score=0.9 required=5.0 tests=BAYES_00,DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,DKIM_VALID_EF,SCC_10_SHORT_WORD_LINES,SCC_20_SHORT_WORD_LINES,SCC_5_SHORT_WORD_LINES,SPF_FAIL,SPF_HELO_PASS,TXREP autolearn=no autolearn_force=no version=3.4.6 X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on server2.sourceware.org List-Id: Hi, compile the following 128-bit GCD routine for the AMD64 processor with full optimization: --- gcdti3.c --- // Stein's algorithm: greatest common divisor __uint128_t __gcdti3(__uint128_t p, __uint128_t q) { unsigned r, s = 0; __uint128_t t; if (p == 0) return q; if (q == 0) return p; if (p == q) return p; if (((unsigned long long) p | (unsigned long long) q) == 0) p >>= 64, q >>= 64, s = 64; r = __builtin_ctzll((unsigned long long) p | (unsigned long long) q), p >>= r, q >>= r, s += r; if ((unsigned long long) p == 0) p >>= 64; r = __builtin_ctzll(p), p >>= r; do { if ((unsigned long long) q == 0) q >>= 64; r = __builtin_ctzll(q), q >>= r; if (p < q) t = q, q = p, p = t; } while (q -= p); return p << s; } --- EOF --- GCC 12.2: gcc -O3 gcdti3.c # https://godbolt.org/z/d1Ma9qnsf __gcdti3(unsigned __int128, unsigned __int128): mov rax, rsi # OOPS: GCCs plays six rounds of shell game! 
mov r8, rdi # mov rsi, rdi # mov rdi, rax # mov rax, rdx # mov rdx, rcx # mov rcx, rdi or rcx, r8 je .L1 mov rcx, r8 mov r8, rdi xor rcx, rax xor r8, rdx or rcx, r8 je .L9 mov rcx, rax or rcx, rdx je .L9 mov rcx, rsi xor r10d, r10d or rcx, rax jne .L3 mov rsi, rdi mov rax, rdx xor edi, edi xor edx, edx mov rcx, rsi mov r10d, 64 or rcx, rax .L3: rep bsf rcx, rcx xor r8d, r8d # OUCH: BSF and TZCNT return at most 63, shrd rsi, rdi, cl shr rdi, cl test cl, 64 # so this is dead code! cmovne rsi, rdi # cmovne rdi, r8 # shrd rax, rdx, cl xor r11d, r11d # OUCH: BSF and TZCNT return at most 63, shr rdx, cl test cl, 64 # so this is dead code! cmovne rax, rdx # cmovne rdx, r11 # mov r8, rsi mov r9, rdi add r10d, ecx mov rcx, r8 mov rsi, rax mov rdi, rdx test r8, r8 je .L14 .L4: rep bsf rcx, rcx mov rax, r8 mov rdx, r9 xor r11d, r11d # OUCH: BSF and TZCNT return at most 63, shr rdx, cl shrd rax, r9, cl and ecx, 64 # (there's also no need to modify ECX) cmovne rax, rdx # so this is dead code! cmovne rdx, r11 # .L7: mov rcx, rsi test rsi, rsi jne .L5 mov rsi, rdi xor edi, edi mov rcx, rsi .L5: rep bsf rcx, rcx xor r8d, r8d # OUCH: BSF and TZCNT return at most 63, shrd rsi, rdi, cl shr rdi, cl test cl, 64 # so this is dead code, mov rcx, rdx cmovne rsi, rdi # and that too! cmovne rdi, r8 # cmp rax, rsi sbb rcx, rdi jnc .L6 mov r8, rax mov r9, rdx mov rax, rsi mov rdx, rdi mov rsi, r8 mov rdi, r9 .L6: sub rsi, rax sbb rdi, rdx mov rcx, rdi or rcx, rsi jne .L7 mov ecx, r10d xor esi, esi shld rdx, rax, cl sal rax, cl and ecx, 64 # Oops: there's no need to modify ECX! cmovne rdx, rax cmovne rax, rsi ret .L9: mov rax, rsi mov rdx, rdi .L1: ret .L14: mov r8, r9 xor r9d, r9d mov rcx, r8 jmp .L4 20 of the total 102 instructions are superfluous! NOT AMUSED Stefan Kanthak PS: [angle-bracketed link lost in archiving] shows properly written assembly code.