public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
From: "hubicka at gcc dot gnu.org" <gcc-bugzilla@gcc.gnu.org>
To: gcc-bugs@gcc.gnu.org
Subject: [Bug middle-end/106081] New: missed vectorization
Date: Fri, 24 Jun 2022 15:56:40 +0000	[thread overview]
Message-ID: <bug-106081-4@http.gcc.gnu.org/bugzilla/> (raw)

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106081

            Bug ID: 106081
           Summary: missed vectorization
           Product: gcc
           Version: 13.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: middle-end
          Assignee: unassigned at gcc dot gnu.org
          Reporter: hubicka at gcc dot gnu.org
  Target Milestone: ---

This testcase (derived from ImageMagick)

/* Input element: four `short` channels a, b, c, d.  The same declaration
   also defines the file-scope pointer `pixels`, which test() indexes as
   an array of these elements.  */
struct pixels
{
        short a,b,c,d;
} *pixels;
/* Accumulator with the same four channel names as `struct pixels`, but
   held as `double`; test() sums into one of these.  */
struct dpixels
{
        double a,b,c,d;
};

/* Reduced testcase: accumulate, per channel, the product of a kernel
   weight and the corresponding `short` channel of pixels[u].

   k  points at the first weight to use; it is decremented once per
      iteration, so the loop reads k[0], k[-1], ..., k[-9999].

   The loop also reads pixels[0] .. pixels[9999] through the global
   pointer `pixels`.

   Returns the weighted combination a + 2*b + 3*c + 4*d of the four
   accumulated sums.  */
double
test(double *k)
{
        /* Empty-brace initializer zeroes all four accumulators.  */
        struct dpixels results={};
        /* u walks the pixel array forwards while k walks backwards.  */
        for (int u=0; u<10000;u++,k--)
        {
                /* The same scalar *k multiplies all four channels; each
                   short channel is converted to double for the multiply.  */
                results.a += *k*pixels[u].a;
                results.b += *k*pixels[u].b;
                results.c += *k*pixels[u].c;
                results.d += *k*pixels[u].d;
        }
        /* Distinct per-channel factors make every accumulator contribute
           differently to the returned value.  */
        return results.a+results.b*2+results.c*3+results.d*4;
}

gets vectorized by clang:
test:                                   # @test
        .cfi_startproc
# %bb.0:
        movq    pixels(%rip), %rax
        vxorpd  %xmm0, %xmm0, %xmm0
        xorl    %ecx, %ecx
        .p2align        4, 0x90
.LBB0_1:                                # =>This Inner Loop Header: Depth=1
        vpmovsxwd       (%rax), %xmm1
        vbroadcastsd    (%rdi,%rcx,8), %ymm2
        addq    $8, %rax
        decq    %rcx
        vcvtdq2pd       %xmm1, %ymm1
        vfmadd231pd     %ymm2, %ymm1, %ymm0     # ymm0 = (ymm1 * ymm2) + ymm0
        cmpq    $-10000, %rcx                   # imm = 0xD8F0
        jne     .LBB0_1
# %bb.2:
        vpermilpd       $1, %xmm0, %xmm1        # xmm1 = xmm0[1,0]
        vfmadd132sd     .LCPI0_0(%rip), %xmm0, %xmm1 # xmm1 = (xmm1 * mem) + xmm0
        vextractf128    $1, %ymm0, %xmm0
        vfmadd231sd     .LCPI0_1(%rip), %xmm0, %xmm1 # xmm1 = (xmm0 * mem) + xmm1
        vpermilpd       $1, %xmm0, %xmm0        # xmm0 = xmm0[1,0]
        vfmadd132sd     .LCPI0_2(%rip), %xmm1, %xmm0 # xmm0 = (xmm0 * mem) + xmm1
        vzeroupper
        retq

but not by GCC.
Original loop is:
    0.94 :   423cb0: vmovdqu (%rsi,%rdi,8),%xmm5 // morphology.c:2984
         : 2983   if ( IsNaN(*k) ) continue;
    0.29 :   423cb5: vpermilpd $0x1,(%rcx),%xmm4
         : 2982   for (u=0; u < (ssize_t) kernel->width; u++, k--) {
    0.46 :   423cbb: add    $0x2,%rdi
    0.07 :   423cbf: add    $0xfffffffffffffff0,%rcx
         : 2984   result.red     += (*k)*k_pixels[u].red;
    0.03 :   423cc3: vpshufb %xmm12,%xmm5,%xmm6
    6.81 :   423cc8: vcvtdq2pd %xmm6,%xmm6
   13.05 :   423ccc: vfmadd231pd %xmm6,%xmm4,%xmm1
         : 2985   result.green   += (*k)*k_pixels[u].green;
   17.45 :   423cd1: vpshufb %xmm15,%xmm5,%xmm6 // morphology.c:2985
    0.33 :   423cd6: vcvtdq2pd %xmm6,%xmm6
    0.00 :   423cda: vfmadd231pd %xmm6,%xmm4,%xmm3
         : 2986   result.blue    += (*k)*k_pixels[u].blue;
   15.28 :   423cdf: vpshufb %xmm13,%xmm5,%xmm6 // morphology.c:2986
         : 2987   result.opacity += (*k)*k_pixels[u].opacity;
    0.00 :   423ce4: vpshufb %xmm8,%xmm5,%xmm5
         : 2986   result.blue    += (*k)*k_pixels[u].blue;
    0.00 :   423ce9: vcvtdq2pd %xmm6,%xmm6
         : 2987   result.opacity += (*k)*k_pixels[u].opacity;
    0.21 :   423ced: vcvtdq2pd %xmm5,%xmm5
         : 2986   result.blue    += (*k)*k_pixels[u].blue;
    0.97 :   423cf1: vfmadd231pd %xmm6,%xmm4,%xmm0
         : 2987   result.opacity += (*k)*k_pixels[u].opacity;
   19.16 :   423cf6: vfmadd231pd %xmm5,%xmm4,%xmm2 // morphology.c:2987
         : 2982   for (u=0; u < (ssize_t) kernel->width; u++, k--) {
   14.51 :   423cfb: cmp    %rdi,%rbp // morphology.c:2982
    0.00 :   423cfe: jne    423cb0 <MorphologyApply.6136+0x20c0>

Changing short to double makes it vectorized:
.L2:
        vmovupd (%rax), %ymm4
        vmovupd 64(%rax), %ymm2
        subq    $-128, %rax
        subq    $32, %rdx
        vunpcklpd       -96(%rax), %ymm4, %ymm1
        vunpckhpd       -96(%rax), %ymm4, %ymm0
        vmovupd -64(%rax), %ymm4
        vunpckhpd       -32(%rax), %ymm2, %ymm2
        vunpcklpd       -32(%rax), %ymm4, %ymm4
        vpermpd $27, 32(%rdx), %ymm3
        vpermpd $216, %ymm1, %ymm1
        vpermpd $216, %ymm0, %ymm0
        vpermpd $216, %ymm2, %ymm2
        vpermpd $216, %ymm4, %ymm4
        vunpcklpd       %ymm2, %ymm0, %ymm10
        vunpckhpd       %ymm2, %ymm0, %ymm0
        vunpckhpd       %ymm4, %ymm1, %ymm9
        vunpcklpd       %ymm4, %ymm1, %ymm1
        vpermpd $216, %ymm10, %ymm10
        vpermpd $216, %ymm0, %ymm0
        vfmadd231pd     %ymm3, %ymm10, %ymm6
        vfmadd231pd     %ymm3, %ymm0, %ymm8
        vpermpd $216, %ymm9, %ymm9
        vpermpd $216, %ymm1, %ymm1
        vfmadd231pd     %ymm3, %ymm1, %ymm5
        vfmadd231pd     %ymm3, %ymm9, %ymm7
        cmpq    %rax, %rcx
        jne     .L2

However, clang's code looks shorter:
LBB0_1:                                # =>This Inner Loop Header: Depth=1
        vbroadcastsd    (%rdi,%rcx,8), %ymm1
        vfmadd231pd     (%rax), %ymm1, %ymm0    # ymm0 = (ymm1 * mem) + ymm0
        addq    $32, %rax
        decq    %rcx
        cmpq    $-10000, %rcx                   # imm = 0xD8F0
        jne     .LBB0_1

It seems we loop-vectorize while clang SLP-vectorizes.

             reply	other threads:[~2022-06-24 15:56 UTC|newest]

Thread overview: 22+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-06-24 15:56 hubicka at gcc dot gnu.org [this message]
2022-06-24 16:14 ` [Bug middle-end/106081] " hubicka at gcc dot gnu.org
2022-06-26  8:08 ` crazylht at gmail dot com
2022-06-27  9:01 ` marxin at gcc dot gnu.org
2022-06-27 10:38 ` rguenth at gcc dot gnu.org
2022-06-27 14:26 ` hubicka at gcc dot gnu.org
2022-10-24  0:14 ` pinskia at gcc dot gnu.org
2023-06-21 13:45 ` rguenth at gcc dot gnu.org
2023-06-27  7:49 ` rguenth at gcc dot gnu.org
2023-06-27  8:13 ` rguenth at gcc dot gnu.org
2023-06-27  9:10 ` rsandifo at gcc dot gnu.org
2023-06-28 12:15 ` hubicka at gcc dot gnu.org
2023-07-26  9:34 ` rguenth at gcc dot gnu.org
2023-07-26  9:35 ` rguenth at gcc dot gnu.org
2023-07-26 10:01 ` rsandifo at gcc dot gnu.org
2023-07-26 10:32 ` rguenth at gcc dot gnu.org
2023-07-26 10:52 ` rguenth at gcc dot gnu.org
2023-07-26 11:00 ` rsandifo at gcc dot gnu.org
2023-07-26 11:15 ` rsandifo at gcc dot gnu.org
2023-07-26 12:15 ` rguenther at suse dot de
2023-07-26 13:28 ` cvs-commit at gcc dot gnu.org
2023-07-26 13:28 ` rguenth at gcc dot gnu.org

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=bug-106081-4@http.gcc.gnu.org/bugzilla/ \
    --to=gcc-bugzilla@gcc.gnu.org \
    --cc=gcc-bugs@gcc.gnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).