[Bug tree-optimization/113104] Suboptimal loop-based slp node splicing across iterations

public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed

From: "fxue at os dot amperecomputing.com" <gcc-bugzilla@gcc.gnu.org>
To: gcc-bugs@gcc.gnu.org
Subject: [Bug tree-optimization/113104] Suboptimal loop-based slp node splicing across iterations
Date: Wed, 10 Jan 2024 05:01:34 +0000	[thread overview]
Message-ID: <bug-113104-4-DPCsEtKA3G@http.gcc.gnu.org/bugzilla/> (raw)
In-Reply-To: <bug-113104-4@http.gcc.gnu.org/bugzilla/>

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113104

Feng Xue <fxue at os dot amperecomputing.com> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
         Resolution|FIXED                       |---
             Status|RESOLVED                    |REOPENED

--- Comment #7 from Feng Xue <fxue at os dot amperecomputing.com> ---
Partial permutations (especially extract-low/high) can incur inefficient
splicing in some situations, even when the vector mode with the lowest cost
is used.

  /* Consumer of the 4x4 result.  Only the declaration is given in this
     report; no definition is shown.  */
  int test(unsigned array[4][4]);

  /* Reproducer.  Fills a local 4x4 array one row per iteration: each
     iteration reads eight consecutive unsigned short values a[0..7],
     stores (a[k] << 3) - (a[k + 4] << 6) into array[i][k] for k = 0..3,
     and then advances a by n elements.  Returns the result of
     test(array).  */
  int foo(unsigned short *a, unsigned long n)
  {
    unsigned array[4][4];

    /* Exactly four iterations; a is stepped by n elements alongside i.  */
    for (unsigned i = 0; i < 4; i++, a += n)
      {
        /* Low half of the row (a[0..3]) is shifted by 3, high half
           (a[4..7]) by 6, and the two are subtracted element-wise.  */
        array[i][0] = (a[0] << 3) - (a[4] << 6);
        array[i][1] = (a[1] << 3) - (a[5] << 6);
        array[i][2] = (a[2] << 3) - (a[6] << 6);
        array[i][3] = (a[3] << 3) - (a[7] << 6);
      }

    return test(array);
  }

        // After vect-compare-cost fix
        //
        // AArch64 listing for foo() as quoted in the report.  On entry
        // x0 = a and x1 = n.  "Row r" below means the eight halfwords
        // loaded from a + r * n.  All four rows are loaded up front, split
        // into their a[0..3] / a[4..7] halves, widened to 32-bit lanes,
        // shifted, subtracted and stored to the array at sp + 16.
        mov     x2, x0
        // Save FP/LR and reserve the 80-byte frame in one pre-indexed store.
        stp     x29, x30, [sp, -80]!
        // x3 = address of row 1 (a + n halfwords).
        add     x3, x2, x1, lsl 1
        // x1 = row stride in bytes (n * 2).
        lsl     x1, x1, 1
        mov     x29, sp
        // x4 = address of row 2.
        add     x4, x3, x1
        // x0 = &array, the argument passed to test().
        add     x0, sp, 16
        // q5 = row 0, q31 = row 3, q0 = row 2, q1 = row 1.
        ldr     q5, [x2]
        ldr     q31, [x4, x1]
        ldr     q0, [x3, x1]
        ldr     q1, [x2, x1]
        // The movi/zip sequence below is the part the report marks as
        // superfluous: 64-bit zips splice halves of two rows together, and
        // 16-bit zips against the zero vector then take them apart again
        // while widening each halfword to a 32-bit lane.
        movi    v28.4s, 0                // zero vector used for widening
        zip1    v29.2d, v0.2d, v31.2d    // a[0..3] of rows 2 and 3
        zip1    v2.2d, v5.2d, v1.2d      // a[0..3] of rows 0 and 1
        zip2    v31.2d, v0.2d, v31.2d    // a[4..7] of rows 2 and 3
        zip2    v1.2d, v5.2d, v1.2d      // a[4..7] of rows 0 and 1
        zip1    v30.8h, v29.8h, v28.8h   // row 2 a[0..3], 32-bit lanes
        zip1    v4.8h, v2.8h, v28.8h     // row 0 a[0..3], 32-bit lanes
        zip1    v27.8h, v31.8h, v28.8h   // row 2 a[4..7], 32-bit lanes
        zip1    v3.8h, v1.8h, v28.8h     // row 0 a[4..7], 32-bit lanes
        zip2    v29.8h, v29.8h, v28.8h   // row 3 a[0..3], 32-bit lanes
        zip2    v31.8h, v31.8h, v28.8h   // row 3 a[4..7], 32-bit lanes
        zip2    v2.8h, v2.8h, v28.8h     // row 1 a[0..3], 32-bit lanes
        zip1    v1.8h, v1.8h, v28.8h     // row 1 a[4..7], 32-bit lanes
        // a[0..3] << 3 for rows 2, 3, 0, 1.
        shl     v30.4s, v30.4s, 3
        shl     v29.4s, v29.4s, 3
        shl     v4.4s, v4.4s, 3
        shl     v2.4s, v2.4s, 3
        // a[4..7] << 6 for rows 2, 3, 0, 1.
        shl     v27.4s, v27.4s, 6
        shl     v31.4s, v31.4s, 6
        shl     v3.4s, v3.4s, 6
        shl     v1.4s, v1.4s, 6
        // (a[k] << 3) - (a[k + 4] << 6): v27 = row 2, v31 = row 3,
        // v3 = row 0, v1 = row 1.
        sub     v27.4s, v30.4s, v27.4s
        sub     v31.4s, v29.4s, v31.4s
        sub     v3.4s, v4.4s, v3.4s
        sub     v1.4s, v2.4s, v1.4s
        // Rows 2 and 3 go to array + 32, rows 0 and 1 to array + 0.
        stp     q27, q31, [sp, 48]
        stp     q3, q1, [sp, 16]
        bl      test
        // Restore FP/LR and release the frame; x0 still holds test()'s result.
        ldp     x29, x30, [sp], 80
        ret


        // Expect it to be optimized as:
        //
        // Desired AArch64 code for the same foo().  On entry x0 = a and
        // x1 = n.  Each row is handled on its own: ushll widens and shifts
        // the low four halfwords (a[0..3]), ushll2 does the same for the
        // high four (a[4..7]), so no zip/splice instructions are needed.
        // x3 = row stride in bytes (n * 2).
        lsl     x3, x1, 1
        mov     x2, x0
        // Save FP/LR and reserve the 80-byte frame in one pre-indexed store.
        stp     x29, x30, [sp, -80]!
        // x1 = address of row 1, x4 = address of row 2.
        add     x1, x2, x1, lsl 1
        add     x4, x1, x3
        mov     x29, sp
        // x0 = &array, the argument passed to test().
        add     x0, sp, 16
        // q30 = row 1, q0 = row 0.
        ldr     q30, [x2, x3]
        ldr     q0, [x2]
        // Row 1: v31 = a[0..3] << 3, v30 = a[4..7] << 6.
        ushll   v31.4s, v30.4h, 3
        ushll2  v30.4s, v30.8h, 6
        // Row 0: v29 = a[0..3] << 3, v0 = a[4..7] << 6.
        ushll   v29.4s, v0.4h, 3
        ushll2  v0.4s, v0.8h, 6
        // v30 = result row 1, v0 = result row 0.
        sub     v30.4s, v31.4s, v30.4s
        sub     v0.4s, v29.4s, v0.4s
        // Store row 0, then reuse q0/q30 for the loads of rows 2 and 3.
        str     q0, [sp, 16]
        ldr     q0, [x1, x3]
        str     q30, [sp, 32]
        ldr     q30, [x4, x3]
        // Row 2: v29 = a[0..3] << 3, v0 = a[4..7] << 6.
        ushll   v29.4s, v0.4h, 3
        ushll2  v0.4s, v0.8h, 6
        // Row 3: v31 = a[0..3] << 3, v30 = a[4..7] << 6.
        ushll   v31.4s, v30.4h, 3
        ushll2  v30.4s, v30.8h, 6
        // v0 = result row 2, v30 = result row 3.
        sub     v0.4s, v29.4s, v0.4s
        sub     v30.4s, v31.4s, v30.4s
        stp     q0, q30, [sp, 48]
        bl      test
        // Restore FP/LR and release the frame; x0 still holds test()'s result.
        ldp     x29, x30, [sp], 80
        ret

Based on the cost arising from splicing, we still need a way to select the
most profitable vector mode per SLP node, rather than relying on the global
vector mode specified in vinfo.

     prev parent reply	other threads:[~2024-01-10  5:01 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-12-21  8:22 [Bug tree-optimization/113104] New: " fxue at os dot amperecomputing.com
2023-12-21  9:07 ` [Bug tree-optimization/113104] " rguenth at gcc dot gnu.org
2023-12-21  9:33 ` fxue at os dot amperecomputing.com
2023-12-21  9:41 ` rguenther at suse dot de
2023-12-30 12:35 ` rsandifo at gcc dot gnu.org
2024-01-05 16:25 ` cvs-commit at gcc dot gnu.org
2024-01-05 16:32 ` rsandifo at gcc dot gnu.org
2024-01-10  5:01 ` fxue at os dot amperecomputing.com [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=bug-113104-4-DPCsEtKA3G@http.gcc.gnu.org/bugzilla/ \
    --to=gcc-bugzilla@gcc.gnu.org \
    --cc=gcc-bugs@gcc.gnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).