From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <gcc-bugzilla@gcc.gnu.org>
Received: by sourceware.org (Postfix, from userid 48)
	id AC1DB3858C78; Wed, 24 May 2023 07:55:26 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org AC1DB3858C78
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org;
	s=default; t=1684914926;
	bh=pcjWJ1AFWw8WqXSr+3f+WmM1wSzAqsY9GAhs76QOkvQ=;
	h=From:To:Subject:Date:In-Reply-To:References:From;
	b=Ix+sbPTUKy/kKOID1+7nrbwjWBX1uMYfL+79DQkN4KVjgAcO/OH0/YlaW33EIOBK5
	 ns3ExVVQlguCNTGqQtgggp6kPEGsYitoU8wpiON+zvBnkrG53v6YZtle8kx+moI4g1
	 +P/PWqQY/6WiMzig15iLEAYSZyi5esjD+/pyiw5o=
From: "rguenther at suse dot de" <gcc-bugzilla@gcc.gnu.org>
To: gcc-bugs@gcc.gnu.org
Subject: [Bug target/109944] vector CTOR with byte elements and SSE2 has STLF
 fail
Date: Wed, 24 May 2023 07:55:26 +0000
X-Bugzilla-Reason: CC
X-Bugzilla-Type: changed
X-Bugzilla-Watch-Reason: None
X-Bugzilla-Product: gcc
X-Bugzilla-Component: target
X-Bugzilla-Version: 13.0
X-Bugzilla-Keywords: missed-optimization
X-Bugzilla-Severity: normal
X-Bugzilla-Who: rguenther at suse dot de
X-Bugzilla-Status: UNCONFIRMED
X-Bugzilla-Resolution: 
X-Bugzilla-Priority: P3
X-Bugzilla-Assigned-To: unassigned at gcc dot gnu.org
X-Bugzilla-Target-Milestone: ---
X-Bugzilla-Flags: 
X-Bugzilla-Changed-Fields: 
Message-ID: <bug-109944-4-QRJ35QKkk7@http.gcc.gnu.org/bugzilla/>
In-Reply-To: <bug-109944-4@http.gcc.gnu.org/bugzilla/>
References: <bug-109944-4@http.gcc.gnu.org/bugzilla/>
Content-Type: text/plain; charset="UTF-8"
Content-Transfer-Encoding: quoted-printable
X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/
Auto-Submitted: auto-generated
MIME-Version: 1.0
List-Id: <gcc-bugs.sourceware.org>

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=3D109944

--- Comment #3 from rguenther at suse dot de <rguenther at suse dot de> ---
On Wed, 24 May 2023, crazylht at gmail dot com wrote:

> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=3D109944
>=20
> --- Comment #2 from Hongtao.liu <crazylht at gmail dot com> ---
> > I think we can go and for a generic V16QImode CTOR and SSE2 create two
> > V8HImode vectors using pinsrw, for the first from zero-extended QImode
> > values of the even elements and for the second from zero-extended and
> > left-shifted values of the odd elements and then IOR the two vectors.
>=20
> Or the backend can recognize it as a HImode(b,c) broadcast + HImode(d,e)
> vec_set(the middle end can recognize it as VEC_DUPLICATE_EXPR +
> .VEC_SET/BIT_INSERT_EXPR when available?)

Yeah.  Note we need to handle the general case with 16 distinct
elements as well.  For simplicity I'm using a memory input below
instead of 16 function parameters.

void foo (char * __restrict a, char *b)
{
  a[0] =3D b[0];
  a[1] =3D b[16];
  a[2] =3D b[32];
  a[3] =3D b[48];
  a[4] =3D b[64];
  a[5] =3D b[80];
  a[6] =3D b[96];
  a[7] =3D b[112];
  a[8] =3D b[128];
  a[9] =3D b[144];
  a[10] =3D b[160];
  a[11] =3D b[176];
  a[12] =3D b[192];
  a[13] =3D b[208];
  a[14] =3D b[224];
  a[15] =3D b[240];
}

with -O2 generates

foo:
.LFB0:
        .cfi_startproc
        movzbl  112(%rsi), %edx
        movzbl  96(%rsi), %eax
        movzbl  224(%rsi), %r8d
        movzbl  (%rsi), %ecx
        salq    $8, %rdx
        orq     %rax, %rdx
        movzbl  80(%rsi), %eax
        salq    $8, %rdx
        orq     %rax, %rdx
        movzbl  64(%rsi), %eax
... more of that ...
        orq     %r8, %rax
        movzbl  144(%rsi), %r8d
        movzbl  128(%rsi), %esi
        salq    $8, %rax
        orq     %r8, %rax
        salq    $8, %rax
        orq     %rsi, %rax
        movq    %rax, -16(%rsp)
        movdqa  -24(%rsp), %xmm0
        movups  %xmm0, (%rdi)
        ret

so one way is to form HImode elements in GPRs by shift and or
and then build a V8HImode vector from that.  Note
code generation for a V8HImode CTOR also looks imperfect
(change char to short and only do elements 0 to 7 above):

foo:
.LFB0:
        .cfi_startproc
        movzwl  (%rsi), %eax
        movd    %eax, %xmm0
        movzwl  64(%rsi), %eax
        pinsrw  $1, 32(%rsi), %xmm0
        movd    %eax, %xmm3
        movzwl  128(%rsi), %eax
        pinsrw  $1, 96(%rsi), %xmm3
        movd    %eax, %xmm1
        movzwl  192(%rsi), %eax
        punpckldq       %xmm3, %xmm0
        pinsrw  $1, 160(%rsi), %xmm1
        movd    %eax, %xmm2
        pinsrw  $1, 224(%rsi), %xmm2
        punpckldq       %xmm2, %xmm1
        punpcklqdq      %xmm1, %xmm0
        movups  %xmm0, (%rdi)
        ret

so we're building SImode elements in %xmm regs and then
unpack them - that's probably better than a series of
pinsrw due to dependences.  For uarchs where GPR->xmm
moves are costly it might be better to do

  pxor %xmm0, %xmm0
  pinsrw $0, (%rsi), %xmm0
  pinsrw $1, 32(%rsi), %xmm0

though?  pinsr* are not especially fast (2 uops on zen,
latency 3, throughput 1 - on zen4 it got worse), so maybe
forming SImode in GPRs might be better for them (or mixing
both to better utilize execution resources).  For the 16
or 8 (distinct) element CTORs there's hardly surrounding
code we can hope to execute in parallel.  I wonder if we
can easily determine in the expander whether we deal with
elements that are nicely available in GPRs or in XMMs
or whether we need to deal with wrong choices later,
for example in STV.

But of course first we need to avoid spill/reload.
That's from ix86_expand_vector_init_general doing

      else if (n_words =3D=3D 2)
        {
          rtx tmp =3D gen_reg_rtx (mode);
          emit_clobber (tmp);
          emit_move_insn (gen_lowpart (tmp_mode, tmp), words[0]);
          emit_move_insn (gen_highpart (tmp_mode, tmp), words[1]);
          emit_move_insn (target, tmp);

which generates (subreg:DI (reg:V16QI 146) 0) and
(subreg:DI (reg:V16QI 146) 8) and then

(insn 52 51 53 2 (parallel [
            (set (subreg:DI (reg:V16QI 146) 0)
                (ior:DI (reg:DI 122)=20
                    (reg:DI 121 [ *b_18(D) ])))
            (clobber (reg:CC 17 flags))
        ]) "t.c":3:8 612 {*iordi_1}
     (expr_list:REG_DEAD (reg:DI 122)
        (expr_list:REG_DEAD (reg:DI 121 [ *b_18(D) ])
            (expr_list:REG_UNUSED (reg:CC 17 flags)
                (nil)))))
(insn 53 52 55 2 (parallel [
            (set (subreg:DI (reg:V16QI 146) 8)
                (ior:DI (reg:DI 144)
                    (reg:DI 143 [ MEM[(char *)b_18(D) + 128B] ])))
            (clobber (reg:CC 17 flags))
        ]) "t.c":3:8 612 {*iordi_1}=20
     (expr_list:REG_DEAD (reg:DI 144)
        (expr_list:REG_DEAD (reg:DI 143 [ MEM[(char *)b_18(D) + 128B] ])
            (expr_list:REG_UNUSED (reg:CC 17 flags)
                (nil)))))=20

which makes LRA spill.  Handling this like the n_words =3D=3D 4 case and
dispatching to ix86_expand_vector_init_general avoids this and generates

        movq    %rax, %xmm0
...
        movq    %rdx, %xmm1
        punpcklqdq      %xmm1, %xmm0
        movups  %xmm0, (%rdi)
diff --git a/gcc/config/i386/i386-expand.cc =
b/gcc/config/i386/i386-expand.cc
index ff3d382f1b4..70754d8f710 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -16367,11 +16367,11 @@ quarter:
        emit_move_insn (target, gen_lowpart (mode, words[0]));
       else if (n_words =3D=3D 2)
        {
-         rtx tmp =3D gen_reg_rtx (mode);
-         emit_clobber (tmp);
-         emit_move_insn (gen_lowpart (tmp_mode, tmp), words[0]);
-         emit_move_insn (gen_highpart (tmp_mode, tmp), words[1]);
-         emit_move_insn (target, tmp);
+         rtx tmp =3D gen_reg_rtx (V2DImode);
+         gcc_assert (tmp_mode =3D=3D DImode);
+         vals =3D gen_rtx_PARALLEL (V2DImode, gen_rtvec_v (2, words));
+         ix86_expand_vector_init_general (false, V2DImode, tmp, vals);
+         emit_move_insn (target, gen_lowpart (mode, tmp));
        }
       else if (n_words =3D=3D 4)
        {=