From mboxrd@z Thu Jan 1 00:00:00 1970
Return-Path: 
Received: by sourceware.org (Postfix, from userid 48)
	id 3A7463858D20; Fri, 8 Sep 2023 02:40:18 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 3A7463858D20
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org; s=default;
	t=1694140818; bh=Jj6OPsS25ljvFlLD4h3sl4TXiI1hz1cehvNcIxMKOUo=;
	h=From:To:Subject:Date:In-Reply-To:References:From;
	b=LCovxZ2wzeoMxC/r70DvCRxX2JFaeJpVcM798b670vo3CQH8dRs6vNmjDrYDpoq3u
	 88ck/2s3+AIYjKX0Ph1y0dLg5v7Dc1Jz4IkSm3nH62OV/3sd1QdMkAWrTBdxnZfHgb
	 Sme3ulbj87NDiFeN9k93zvEOZNyFwEtJmNsZPfeI=
From: "d_vampile at 163 dot com" 
To: gcc-bugs@gcc.gnu.org
Subject: [Bug target/111332] Using GCC7.3.0 and GCC10.3.0 to compile the
 same test case, assembler file instructions are different and performance
 fallback is obvious.
Date: Fri, 08 Sep 2023 02:40:18 +0000
X-Bugzilla-Reason: CC
X-Bugzilla-Type: changed
X-Bugzilla-Watch-Reason: None
X-Bugzilla-Product: gcc
X-Bugzilla-Component: target
X-Bugzilla-Version: 10.3.0
X-Bugzilla-Keywords: 
X-Bugzilla-Severity: normal
X-Bugzilla-Who: d_vampile at 163 dot com
X-Bugzilla-Status: RESOLVED
X-Bugzilla-Resolution: FIXED
X-Bugzilla-Priority: P3
X-Bugzilla-Assigned-To: unassigned at gcc dot gnu.org
X-Bugzilla-Target-Milestone: ---
X-Bugzilla-Flags: 
X-Bugzilla-Changed-Fields: 
Message-ID: 
In-Reply-To: 
References: 
Content-Type: text/plain; charset="UTF-8"
Content-Transfer-Encoding: quoted-printable
X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/
Auto-Submitted: auto-generated
MIME-Version: 1.0
List-Id: 

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111332

--- Comment #7 from d_vampile ---
(In reply to Andrew Pinski from comment #3)
> GCC 11+ produces:
> .L3:
>         vmovdqu (%rsi), %ymm2
>         vmovdqu 32(%rsi), %ymm1
>         subq    $-128, %rdi
>         subq    $-128, %rsi
>         vmovdqu -64(%rsi), %ymm0
>         vmovdqu -32(%rsi), %ymm3
>         vmovdqu %ymm2, -128(%rdi)
>         vmovdqu %ymm3, -32(%rdi)
>         vmovdqu %ymm1, -96(%rdi)
>         vmovdqu %ymm0, -64(%rdi)
>         cmpq    %rax, %rdi
>         jne     .L3
> 
> Which is the best code ...
GCC 7.3.0 produces:

extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_si256 (__m256i_u const *__P)
{
  return *__P;
  401170:	c5 fa 6f 1e          	vmovdqu (%rsi),%xmm3
	dst = (uint8_t *)dst + 128;
  401174:	48 83 ef 80          	sub    $0xffffffffffffff80,%rdi
	src = (const uint8_t *)src + 128;
  401178:	48 83 ee 80          	sub    $0xffffffffffffff80,%rsi
  40117c:	c5 fa 6f 56 a0       	vmovdqu -0x60(%rsi),%xmm2
  401181:	c4 e3 65 38 5e 90 01 	vinserti128 $0x1,-0x70(%rsi),%ymm3,%ymm3
  401188:	c5 fa 6f 4e c0       	vmovdqu -0x40(%rsi),%xmm1
  40118d:	c4 e3 6d 38 56 b0 01 	vinserti128 $0x1,-0x50(%rsi),%ymm2,%ymm2
  401194:	c5 fa 6f 46 e0       	vmovdqu -0x20(%rsi),%xmm0
  401199:	c4 e3 75 38 4e d0 01 	vinserti128 $0x1,-0x30(%rsi),%ymm1,%ymm1
  4011a0:	c4 e3 7d 38 46 f0 01 	vinserti128 $0x1,-0x10(%rsi),%ymm0,%ymm0
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_si256 (__m256i_u *__P, __m256i __A)
{
  *__P = __A;
  4011a7:	c5 f8 11 5f 80       	vmovups %xmm3,-0x80(%rdi)
  4011ac:	c4 e3 7d 39 5f 90 01 	vextracti128 $0x1,%ymm3,-0x70(%rdi)
  4011b3:	c5 f8 11 57 a0       	vmovups %xmm2,-0x60(%rdi)
  4011b8:	c4 e3 7d 39 57 b0 01 	vextracti128 $0x1,%ymm2,-0x50(%rdi)
  4011bf:	c5 f8 11 4f c0       	vmovups %xmm1,-0x40(%rdi)
  4011c4:	c4 e3 7d 39 4f d0 01 	vextracti128 $0x1,%ymm1,-0x30(%rdi)
  4011cb:	c5 f8 11 47 e0       	vmovups %xmm0,-0x20(%rdi)
  4011d0:	c4 e3 7d 39 47 f0 01 	vextracti128 $0x1,%ymm0,-0x10(%rdi)
	while (n >= 128) {
  4011d7:	48 39 c7             	cmp    %rax,%rdi
  4011da:	75 94                	jne    401170
  4011dc:	c5 f8 77             	vzeroupper

In terms of runtime, this code is the best.
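
For reference, here is a minimal sketch of the inner copy loop implied by the
source fragments interleaved in the disassembly above (the while (n >= 128)
test, the 128-byte pointer bumps, and the inlined _mm256_loadu_si256 /
_mm256_storeu_si256 bodies). The function name and the lack of tail handling
are my assumptions; the actual test case is not attached to this comment:

#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>

/* Hypothetical reconstruction: copy n bytes in 128-byte chunks using
   four unaligned 256-bit loads followed by four unaligned 256-bit
   stores per iteration.  A tail of fewer than 128 bytes is left to
   the caller in this sketch.  */
static void
copy_avx2_128 (void *dst, const void *src, size_t n)
{
  while (n >= 128)
    {
      /* Load 128 bytes; each load may compile to a single vmovdqu %ymm
         or to a split vmovdqu %xmm + vinserti128 pair.  */
      __m256i a = _mm256_loadu_si256 ((const __m256i_u *) src);
      __m256i b = _mm256_loadu_si256 ((const __m256i_u *) ((const uint8_t *) src + 32));
      __m256i c = _mm256_loadu_si256 ((const __m256i_u *) ((const uint8_t *) src + 64));
      __m256i d = _mm256_loadu_si256 ((const __m256i_u *) ((const uint8_t *) src + 96));
      /* Store 128 bytes; likewise either vmovdqu %ymm or a split
         vmovups %xmm + vextracti128 pair.  */
      _mm256_storeu_si256 ((__m256i_u *) dst, a);
      _mm256_storeu_si256 ((__m256i_u *) ((uint8_t *) dst + 32), b);
      _mm256_storeu_si256 ((__m256i_u *) ((uint8_t *) dst + 64), c);
      _mm256_storeu_si256 ((__m256i_u *) ((uint8_t *) dst + 96), d);
      dst = (uint8_t *) dst + 128;
      src = (const uint8_t *) src + 128;
      n -= 128;
    }
}

Whether GCC emits the full-width vmovdqu %ymm form or the split
vmovdqu %xmm + vinserti128/vextracti128 form for these unaligned intrinsics
is controlled by the -mavx256-split-unaligned-load and
-mavx256-split-unaligned-store tuning flags, so the two instruction
sequences can be compared on the same compiler by toggling those flags;
which one runs faster depends on the microarchitecture.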