From mboxrd@z Thu Jan 1 00:00:00 1970
Return-Path: 
Received: by sourceware.org (Postfix, from userid 48)
	id 3A7463858D20; Fri, 8 Sep 2023 02:40:18 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 3A7463858D20
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org; s=default;
	t=1694140818; bh=Jj6OPsS25ljvFlLD4h3sl4TXiI1hz1cehvNcIxMKOUo=;
	h=From:To:Subject:Date:In-Reply-To:References:From;
	b=LCovxZ2wzeoMxC/r70DvCRxX2JFaeJpVcM798b670vo3CQH8dRs6vNmjDrYDpoq3u
	 88ck/2s3+AIYjKX0Ph1y0dLg5v7Dc1Jz4IkSm3nH62OV/3sd1QdMkAWrTBdxnZfHgb
	 Sme3ulbj87NDiFeN9k93zvEOZNyFwEtJmNsZPfeI=
From: "d_vampile at 163 dot com" 
To: gcc-bugs@gcc.gnu.org
Subject: [Bug target/111332] Using GCC7.3.0 and GCC10.3.0 to compile the
 same test case, assembler file instructions are different and performance
 fallback is obvious.
Date: Fri, 08 Sep 2023 02:40:18 +0000
X-Bugzilla-Reason: CC
X-Bugzilla-Type: changed
X-Bugzilla-Watch-Reason: None
X-Bugzilla-Product: gcc
X-Bugzilla-Component: target
X-Bugzilla-Version: 10.3.0
X-Bugzilla-Keywords: 
X-Bugzilla-Severity: normal
X-Bugzilla-Who: d_vampile at 163 dot com
X-Bugzilla-Status: RESOLVED
X-Bugzilla-Resolution: FIXED
X-Bugzilla-Priority: P3
X-Bugzilla-Assigned-To: unassigned at gcc dot gnu.org
X-Bugzilla-Target-Milestone: ---
X-Bugzilla-Flags: 
X-Bugzilla-Changed-Fields: 
Message-ID: 
In-Reply-To: 
References: 
Content-Type: text/plain; charset="UTF-8"
Content-Transfer-Encoding: quoted-printable
X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/
Auto-Submitted: auto-generated
MIME-Version: 1.0
List-Id: 

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111332

--- Comment #7 from d_vampile ---
(In reply to Andrew Pinski from comment #3)
> GCC 11+ produces:
> .L3:
>         vmovdqu (%rsi), %ymm2
>         vmovdqu 32(%rsi), %ymm1
>         subq    $-128, %rdi
>         subq    $-128, %rsi
>         vmovdqu -64(%rsi), %ymm0
>         vmovdqu -32(%rsi), %ymm3
>         vmovdqu %ymm2, -128(%rdi)
>         vmovdqu %ymm3, -32(%rdi)
>         vmovdqu %ymm1, -96(%rdi)
>         vmovdqu %ymm0, -64(%rdi)
>         cmpq    %rax, %rdi
>         jne     .L3
> 
> Which is the best code ...
GCC 7.3.0 produces:

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_si256 (__m256i_u const *__P)
{
  return *__P;
  401170:	c5 fa 6f 1e          	vmovdqu (%rsi),%xmm3
	dst = (uint8_t *)dst + 128;
  401174:	48 83 ef 80          	sub    $0xffffffffffffff80,%rdi
	src = (const uint8_t *)src + 128;
  401178:	48 83 ee 80          	sub    $0xffffffffffffff80,%rsi
  40117c:	c5 fa 6f 56 a0       	vmovdqu -0x60(%rsi),%xmm2
  401181:	c4 e3 65 38 5e 90 01 	vinserti128 $0x1,-0x70(%rsi),%ymm3,%ymm3
  401188:	c5 fa 6f 4e c0       	vmovdqu -0x40(%rsi),%xmm1
  40118d:	c4 e3 6d 38 56 b0 01 	vinserti128 $0x1,-0x50(%rsi),%ymm2,%ymm2
  401194:	c5 fa 6f 46 e0       	vmovdqu -0x20(%rsi),%xmm0
  401199:	c4 e3 75 38 4e d0 01 	vinserti128 $0x1,-0x30(%rsi),%ymm1,%ymm1
  4011a0:	c4 e3 7d 38 46 f0 01 	vinserti128 $0x1,-0x10(%rsi),%ymm0,%ymm0
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_si256 (__m256i_u *__P, __m256i __A)
{
  *__P = __A;
  4011a7:	c5 f8 11 5f 80       	vmovups %xmm3,-0x80(%rdi)
  4011ac:	c4 e3 7d 39 5f 90 01 	vextracti128 $0x1,%ymm3,-0x70(%rdi)
  4011b3:	c5 f8 11 57 a0       	vmovups %xmm2,-0x60(%rdi)
  4011b8:	c4 e3 7d 39 57 b0 01 	vextracti128 $0x1,%ymm2,-0x50(%rdi)
  4011bf:	c5 f8 11 4f c0       	vmovups %xmm1,-0x40(%rdi)
  4011c4:	c4 e3 7d 39 4f d0 01 	vextracti128 $0x1,%ymm1,-0x30(%rdi)
  4011cb:	c5 f8 11 47 e0       	vmovups %xmm0,-0x20(%rdi)
  4011d0:	c4 e3 7d 39 47 f0 01 	vextracti128 $0x1,%ymm0,-0x10(%rdi)
	while (n >= 128) {
  4011d7:	48 39 c7             	cmp    %rax,%rdi
  4011da:	75 94                	jne    401170
  4011dc:	c5 f8 77             	vzeroupper

In terms of runtime, this code is the best.
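
For reference, here is a minimal sketch of the inner copy loop implied by the
source fragments interleaved in the disassembly above (the while (n >= 128)
test, the 128-byte pointer bumps, and the inlined _mm256_loadu_si256 /
_mm256_storeu_si256 bodies). The function name and the lack of tail handling
are my assumptions; the actual test case is not attached to this comment:

#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>

/* Hypothetical reconstruction: copy n bytes in 128-byte chunks using
   four unaligned 256-bit loads followed by four unaligned 256-bit
   stores per iteration.  A tail of fewer than 128 bytes is left to
   the caller in this sketch.  */
static void
copy_avx2_128 (void *dst, const void *src, size_t n)
{
  while (n >= 128)
    {
      /* Load 128 bytes; each load may compile to a single vmovdqu %ymm
         or to a split vmovdqu %xmm + vinserti128 pair.  */
      __m256i a = _mm256_loadu_si256 ((const __m256i_u *) src);
      __m256i b = _mm256_loadu_si256 ((const __m256i_u *) ((const uint8_t *) src + 32));
      __m256i c = _mm256_loadu_si256 ((const __m256i_u *) ((const uint8_t *) src + 64));
      __m256i d = _mm256_loadu_si256 ((const __m256i_u *) ((const uint8_t *) src + 96));
      /* Store 128 bytes; likewise either vmovdqu %ymm or a split
         vmovups %xmm + vextracti128 pair.  */
      _mm256_storeu_si256 ((__m256i_u *) dst, a);
      _mm256_storeu_si256 ((__m256i_u *) ((uint8_t *) dst + 32), b);
      _mm256_storeu_si256 ((__m256i_u *) ((uint8_t *) dst + 64), c);
      _mm256_storeu_si256 ((__m256i_u *) ((uint8_t *) dst + 96), d);
      dst = (uint8_t *) dst + 128;
      src = (const uint8_t *) src + 128;
      n -= 128;
    }
}

Whether GCC emits the full-width vmovdqu %ymm form or the split
vmovdqu %xmm + vinserti128/vextracti128 form for these unaligned intrinsics
is controlled by the -mavx256-split-unaligned-load and
-mavx256-split-unaligned-store tuning flags, so the two instruction
sequences can be compared on the same compiler by toggling those flags;
which one runs faster depends on the microarchitecture.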