From: "rguenth at gcc dot gnu.org"
To: gcc-bugs@gcc.gnu.org
Subject: [Bug tree-optimization/108724] Poor codegen when summing two arrays without AVX or SSE
Date: Fri, 10 Feb 2023 10:00:29 +0000

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108724

Richard Biener changed:

           What    |Removed |Added
----------------------------------------------------------------------------
           See Also|        |https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101801

--- Comment #4 from Richard Biener ---
So the vectorizer thinks that

foo:
.LFB0:
        .cfi_startproc
        movabsq $9223372034707292159, %rax
        movq    %rdi, %rcx
        movq    (%rdx), %r10
        movq    (%rsi), %rdi
        movq    %rax, %r8
        movq    %rax, %r9
        movq    %rax, %r11
        andq    %r10, %r8
        andq    %rdi, %r9
        addq    %r8, %r9
        movq    %rdi, %r8
        movabsq $-9223372034707292160, %rdi
        xorq    %r10, %r8
        movq    8(%rdx), %r10
        andq    %rdi, %r8
        xorq    %r9, %r8
        movq    %rax, %r9
        movq    %r8, (%rcx)
        movq    8(%rsi), %r8
        andq    %r10, %r9
        andq    %r8, %r11
        xorq    %r10, %r8
        movq    16(%rdx), %r10
        addq    %r11, %r9
        andq    %rdi, %r8
        movq    %rax, %r11
        xorq    %r9, %r8
        movq    %rax, %r9
        andq    %r10, %r11
        movq    %r8, 8(%rcx)
        movq    16(%rsi), %r8
        andq    %r8, %r9
        xorq    %r10, %r8
        movq    24(%rdx), %r10
        addq    %r11, %r9
        andq    %rdi, %r8
        movq    %rax, %r11
        xorq    %r9, %r8
        movq    %rax, %r9
        andq    %r10, %r11
        movq    %r8, 16(%rcx)
        movq    24(%rsi), %r8
        andq    %r8, %r9
        xorq    %r10, %r8
        movq    32(%rdx), %r10
        addq    %r11, %r9
        andq    %rdi, %r8
        movq    %rax, %r11
        xorq    %r9, %r8
        movq    %rax, %r9
        andq    %r10, %r11
        movq    %r8, 24(%rcx)
        movq    32(%rsi), %r8
        andq    %r8, %r9
        xorq    %r10, %r8
        movq    40(%rdx), %r10
        addq    %r11, %r9
        andq    %rdi, %r8
        movq    %rax, %r11
        xorq    %r9, %r8
        movq    %rax, %r9
        andq    %r10, %r11
        movq    %r8, 32(%rcx)
        movq    40(%rsi), %r8
        andq    %r8, %r9
        addq    %r11, %r9
        xorq    %r10, %r8
        movq    48(%rsi), %r10
        movq    %rax, %r11
        andq    %rdi, %r8
        xorq    %r9, %r8
        movq    %rax, %r9
        andq    %r10, %r11
        movq    %r8, 40(%rcx)
        movq    48(%rdx), %r8
        movq    56(%rdx), %rdx
        andq    %r8, %r9
        xorq    %r10, %r8
        addq    %r11, %r9
        andq    %rdi, %r8
        xorq    %r9, %r8
        movq    %r8, 48(%rcx)
        movq    56(%rsi), %r8
        movq    %rax, %rsi
        andq    %rdx, %rsi
        andq    %r8, %rax
        xorq    %r8, %rdx
        addq    %rsi, %rax
        andq    %rdi, %rdx
        xorq    %rdx, %rax
        movq    %rax, 56(%rcx)
        ret

will be faster than when not vectorizing.
Not vectorizing produces

foo:
.LFB0:
        .cfi_startproc
        movq    %rsi, %rcx
        movl    (%rsi), %esi
        addl    (%rdx), %esi
        movl    %esi, (%rdi)
        movl    4(%rdx), %esi
        addl    4(%rcx), %esi
        movl    %esi, 4(%rdi)
        movl    8(%rdx), %esi
        addl    8(%rcx), %esi
        movl    %esi, 8(%rdi)
        movl    12(%rdx), %esi
        addl    12(%rcx), %esi
        movl    %esi, 12(%rdi)
        movl    16(%rdx), %esi
        addl    16(%rcx), %esi
        movl    %esi, 16(%rdi)
        movl    20(%rdx), %esi
        addl    20(%rcx), %esi
        movl    %esi, 20(%rdi)
        movl    24(%rdx), %esi
        addl    24(%rcx), %esi
        movl    %esi, 24(%rdi)
        movl    28(%rdx), %esi
        addl    28(%rcx), %esi
        movl    %esi, 28(%rdi)
        movl    32(%rdx), %esi
        addl    32(%rcx), %esi
        movl    %esi, 32(%rdi)
        movl    36(%rdx), %esi
        addl    36(%rcx), %esi
        movl    %esi, 36(%rdi)
        movl    40(%rdx), %esi
        addl    40(%rcx), %esi
        movl    %esi, 40(%rdi)
        movl    44(%rdx), %esi
        addl    44(%rcx), %esi
        movl    %esi, 44(%rdi)
        movl    48(%rdx), %esi
        addl    48(%rcx), %esi
        movl    %esi, 48(%rdi)
        movl    52(%rdx), %esi
        addl    52(%rcx), %esi
        movl    %esi, 52(%rdi)
        movl    56(%rdx), %esi
        movl    60(%rdx), %edx
        addl    56(%rcx), %esi
        addl    60(%rcx), %edx
        movl    %esi, 56(%rdi)
        movl    %edx, 60(%rdi)
        ret

The vectorizer produces un-lowered vector adds, which is good in case
follow-up optimizations are possible (the ops are not obfuscated), but also
bad because unrolling estimates the size in the wrong way.  The costs are

  *_3 1 times scalar_load costs 12 in prologue
  *_5 1 times scalar_load costs 12 in prologue
  _4 + _6 1 times scalar_stmt costs 4 in prologue
  _8 1 times scalar_store costs 12 in prologue

vs

  *_3 1 times unaligned_load (misalign -1) costs 12 in body
  *_5 1 times unaligned_load (misalign -1) costs 12 in body
  _4 + _6 1 times vector_stmt costs 4 in body
  _4 + _6 5 times scalar_stmt costs 20 in body
  _8 1 times unaligned_store (misalign -1) costs 12 in body

and as usual the savings from the wide loads and stores outweigh the loss on
the arithmetic side, but it's a close call:

  Vector inside of loop cost: 60
  Scalar iteration cost: 40

so 2*40 > 60 and the vectorization is deemed profitable.

The easiest fix is to avoid vectorizing; another possibility is to stick
with the vectorizer's decision and expand to the lowered sequence
immediately from within the vectorizer itself.

The original goal was to remove this hard cap on the number of elements, but
this bug shows that work was incomplete.  I'm going to reinstate the hard
cap and revisit this for GCC 14.