From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 48) id 736613858D20; Sun, 14 Jan 2024 21:37:53 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 736613858D20 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org; s=default; t=1705268273; bh=cgVCuy0vN33p5SHWlYW7SP8rBGVJSA0x2duY402L+SA=; h=From:To:Subject:Date:From; b=s3yi9yRJhT7GqQg3DgvYwScbCR3G5OqAlbeSHwAgG2zHAR9B5skeCzd4fhCn1llBH i2XpQ4rF//rDv5AnYIdcSh2FR5T46XCs/9IpimTN8MLFKMqIrp89iU7nDP7guF8m2I 8k4WHE3Lrf8toytWmdNFEVfp8y/dx963Y86Us/g0= From: "llvm at rifkin dot dev" To: gcc-bugs@gcc.gnu.org Subject: [Bug tree-optimization/113392] New: Missed fold of loading 8 consecutive bytes leading to a missed byteswap optimization Date: Sun, 14 Jan 2024 21:37:52 +0000 X-Bugzilla-Reason: CC X-Bugzilla-Type: new X-Bugzilla-Watch-Reason: None X-Bugzilla-Product: gcc X-Bugzilla-Component: tree-optimization X-Bugzilla-Version: unknown X-Bugzilla-Keywords: X-Bugzilla-Severity: normal X-Bugzilla-Who: llvm at rifkin dot dev X-Bugzilla-Status: UNCONFIRMED X-Bugzilla-Resolution: X-Bugzilla-Priority: P3 X-Bugzilla-Assigned-To: unassigned at gcc dot gnu.org X-Bugzilla-Target-Milestone: --- X-Bugzilla-Flags: X-Bugzilla-Changed-Fields: bug_id short_desc product version bug_status bug_severity priority component assigned_to reporter target_milestone Message-ID: Content-Type: text/plain; charset="UTF-8" Content-Transfer-Encoding: quoted-printable X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/ Auto-Submitted: auto-generated MIME-Version: 1.0 List-Id: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=3D113392 Bug ID: 113392 Summary: Missed fold of loading 8 consecutive bytes leading to a missed byteswap optimization Product: gcc Version: unknown Status: UNCONFIRMED Severity: normal Priority: P3 Component: tree-optimization Assignee: unassigned at gcc dot gnu.org Reporter: llvm at rifkin dot dev Target Milestone: --- The simple load function uint64_t load64bits(const 
uint8_t* data) { uint8_t d0 = data[0]; uint8_t d1 = data[1]; uint8_t d2 = data[2]; uint8_t d3 = data[3]; uint8_t d4 = data[4]; uint8_t d5 = data[5]; uint8_t d6 = data[6]; uint8_t d7 = data[7]; return (uint64_t) d0 | (uint64_t) d1 << 8 | (uint64_t) d2 << 16 | (uint64_t) d3 << 24 | (uint64_t) d4 << 32 | (uint64_t) d5 << 40 | (uint64_t) d6 << 48 | (uint64_t) d7 << 56; } is correctly optimized to load64bits(unsigned char const*): mov rax, QWORD PTR [rdi] ret however, uint64_t load64bits2(const uint8_t* data, size_t index) { uint8_t d0 = data[index++]; uint8_t d1 = data[index++]; uint8_t d2 = data[index++]; uint8_t d3 = data[index++]; uint8_t d4 = data[index++]; uint8_t d5 = data[index++]; uint8_t d6 = data[index++]; uint8_t d7 = data[index++]; return (uint64_t) d0 | (uint64_t) d1 << 8 | (uint64_t) d2 << 16 | (uint64_t) d3 << 24 | (uint64_t) d4 << 32 | (uint64_t) d5 << 40 | (uint64_t) d6 << 48 | (uint64_t) d7 << 56; } compiles to load64bits2(unsigned char const*, unsigned long): mov rdx, rsi movzx eax, BYTE PTR [rdi+1+rsi] movzx esi, BYTE PTR [rdi+2+rsi] sal rax, 8 sal rsi, 16 or rax, rsi movzx esi, BYTE PTR [rdi+rdx] or rax, rsi movzx esi, BYTE PTR [rdi+3+rdx] sal rsi, 24 or rax, rsi movzx esi, BYTE PTR [rdi+4+rdx] sal rsi, 32 or rax, rsi movzx esi, BYTE PTR [rdi+5+rdx] sal rsi, 40 or rax, rsi movzx esi, BYTE PTR [rdi+6+rdx] movzx edx, BYTE PTR [rdi+7+rdx] sal rsi, 48 sal rdx, 56 or rax, rsi or rax, rdx ret Clang compiles both to a single mov. 
This impacts other operations, such as a simple byteswap uint64_t bswap64(const uint8_t* data, size_t index) { uint8_t d0 = data[index++]; uint8_t d1 = data[index++]; uint8_t d2 = data[index++]; uint8_t d3 = data[index++]; uint8_t d4 = data[index++]; uint8_t d5 = data[index++]; uint8_t d6 = data[index++]; uint8_t d7 = data[index++]; return (uint64_t) d7 | (uint64_t) d6 << 8 | (uint64_t) d5 << 16 | (uint64_t) d4 << 24 | (uint64_t) d3 << 32 | (uint64_t) d2 << 40 | (uint64_t) d1 << 48 | (uint64_t) d0 << 56; } compiling to bswap64(unsigned char const*, unsigned long): mov rdx, rsi movzx eax, BYTE PTR [rdi+6+rsi] movzx esi, BYTE PTR [rdi+5+rsi] sal rax, 8 sal rsi, 16 or rax, rsi movzx esi, BYTE PTR [rdi+7+rdx] or rax, rsi movzx esi, BYTE PTR [rdi+4+rdx] sal rsi, 24 or rax, rsi movzx esi, BYTE PTR [rdi+3+rdx] sal rsi, 32 or rax, rsi movzx esi, BYTE PTR [rdi+2+rdx] sal rsi, 40 or rax, rsi movzx esi, BYTE PTR [rdi+1+rdx] movzx edx, BYTE PTR [rdi+rdx] sal rsi, 48 sal rdx, 56 or rax, rsi or rax, rdx ret instead of bswap64(unsigned char const*, unsigned long): movbe rax, qword ptr [rdi + rsi] ret https://godbolt.org/z/bjxq1rEYY