From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <gcc-bugzilla@gcc.gnu.org>
Received: by sourceware.org (Postfix, from userid 48)
	id 7F8A73858C52; Wed, 10 Jan 2024 05:01:36 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 7F8A73858C52
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org;
	s=default; t=1704862896;
	bh=5tCw5KCuZ+NURuRlSr3hIrVtBsrLcBadx4mj368iAk0=;
	h=From:To:Subject:Date:In-Reply-To:References:From;
	b=keksik2vOZnji2mAwWdGEbvNgrxfC425TioDcFuAnygZOLvKIkfAbP2Hgp2KGGDx9
	 DFvU1jsgmPsQ0+3GHt2WXrV3JGhqfAG2NBO/Ul74vWFGWI2YfN2Jn8AS/8kZk3Hqde
	 EIY9p3oprsmU7HR3jwNqRU6gJWhhuNCxFYJotLc8=
From: "fxue at os dot amperecomputing.com" <gcc-bugzilla@gcc.gnu.org>
To: gcc-bugs@gcc.gnu.org
Subject: [Bug tree-optimization/113104] Suboptimal loop-based slp node
 splicing across iterations
Date: Wed, 10 Jan 2024 05:01:34 +0000
X-Bugzilla-Reason: CC
X-Bugzilla-Type: changed
X-Bugzilla-Watch-Reason: None
X-Bugzilla-Product: gcc
X-Bugzilla-Component: tree-optimization
X-Bugzilla-Version: 14.0
X-Bugzilla-Keywords: 
X-Bugzilla-Severity: normal
X-Bugzilla-Who: fxue at os dot amperecomputing.com
X-Bugzilla-Status: REOPENED
X-Bugzilla-Resolution: 
X-Bugzilla-Priority: P3
X-Bugzilla-Assigned-To: rsandifo at gcc dot gnu.org
X-Bugzilla-Target-Milestone: ---
X-Bugzilla-Flags: 
X-Bugzilla-Changed-Fields: resolution bug_status
Message-ID: <bug-113104-4-DPCsEtKA3G@http.gcc.gnu.org/bugzilla/>
In-Reply-To: <bug-113104-4@http.gcc.gnu.org/bugzilla/>
References: <bug-113104-4@http.gcc.gnu.org/bugzilla/>
Content-Type: text/plain; charset="UTF-8"
Content-Transfer-Encoding: quoted-printable
X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/
Auto-Submitted: auto-generated
MIME-Version: 1.0
List-Id: <gcc-bugs.sourceware.org>

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=3D113104

Feng Xue <fxue at os dot amperecomputing.com> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
         Resolution|FIXED                       |---
             Status|RESOLVED                    |REOPENED
--- Comment #7 from Feng Xue <fxue at os dot amperecomputing.com> ---
Partial permutation (especially extract-low/high) would incur inefficient
splicing in some situation even the vector mode with lowest cost is used.

  int test(unsigned array[4][4]);

  int foo(unsigned short *a, unsigned long n)
  {
    unsigned array[4][4];

    for (unsigned i =3D 0; i < 4; i++, a +=3D n)
      {
        array[i][0] =3D (a[0] << 3) - (a[4] << 6);
        array[i][1] =3D (a[1] << 3) - (a[5] << 6);
        array[i][2] =3D (a[2] << 3) - (a[6] << 6);
        array[i][3] =3D (a[3] << 3) - (a[7] << 6);
      }

    return test(array);
  }

        // After vect-compare-cost fix
        mov     x2, x0
        stp     x29, x30, [sp, -80]!
        add     x3, x2, x1, lsl 1
        lsl     x1, x1, 1
        mov     x29, sp
        add     x4, x3, x1
        add     x0, sp, 16
        ldr     q5, [x2]
        ldr     q31, [x4, x1]
        ldr     q0, [x3, x1]
        ldr     q1, [x2, x1]
        movi    v28.4s, 0                //
        zip1    v29.2d, v0.2d, v31.2d    //
        zip1    v2.2d, v5.2d, v1.2d      //
        zip2    v31.2d, v0.2d, v31.2d    //
        zip2    v1.2d, v5.2d, v1.2d      //=20
        zip1    v30.8h, v29.8h, v28.8h   //
        zip1    v4.8h, v2.8h, v28.8h     //  superfluous
        zip1    v27.8h, v31.8h, v28.8h   //
        zip1    v3.8h, v1.8h, v28.8h     //
        zip2    v29.8h, v29.8h, v28.8h   //
        zip2    v31.8h, v31.8h, v28.8h   //
        zip2    v2.8h, v2.8h, v28.8h     //
        zip2    v1.8h, v1.8h, v28.8h     //
        shl     v30.4s, v30.4s, 3
        shl     v29.4s, v29.4s, 3
        shl     v4.4s, v4.4s, 3
        shl     v2.4s, v2.4s, 3
        shl     v27.4s, v27.4s, 6
        shl     v31.4s, v31.4s, 6
        shl     v3.4s, v3.4s, 6
        shl     v1.4s, v1.4s, 6
        sub     v27.4s, v30.4s, v27.4s
        sub     v31.4s, v29.4s, v31.4s
        sub     v3.4s, v4.4s, v3.4s
        sub     v1.4s, v2.4s, v1.4s
        stp     q27, q31, [sp, 48]
        stp     q3, q1, [sp, 16]
        bl      test
        ldp     x29, x30, [sp], 80
        ret


        // Expect it to be optimized as:
        lsl     x3, x1, 1
        mov     x2, x0
        stp     x29, x30, [sp, -80]!
        add     x1, x2, x1, lsl 1
        add     x4, x1, x3
        mov     x29, sp
        add     x0, sp, 16
        ldr     q30, [x2, x3]
        ldr     q0, [x2]
        ushll   v31.4s, v30.4h, 3
        ushll2  v30.4s, v30.8h, 6
        ushll   v29.4s, v0.4h, 3
        ushll2  v0.4s, v0.8h, 6
        sub     v30.4s, v31.4s, v30.4s
        sub     v0.4s, v29.4s, v0.4s
        str     q0, [sp, 16]
        ldr     q0, [x1, x3]
        str     q30, [sp, 32]
        ldr     q30, [x4, x3]
        ushll   v29.4s, v0.4h, 3
        ushll2  v0.4s, v0.8h, 6
        ushll   v31.4s, v30.4h, 3
        ushll2  v30.4s, v30.8h, 6
        sub     v0.4s, v29.4s, v0.4s
        sub     v30.4s, v31.4s, v30.4s
        stp     q0, q30, [sp, 48]
        bl      test
        ldp     x29, x30, [sp], 80
        ret

Based on cost arising from splicing, we still need a way to select the most
profitable vector mode per slp node, over the global vector mode specified =
in
vinfo.=