[Bug target/109812] GraphicsMagick resize is a lot slower in GCC 13.1 vs Clang 16 on Intel Raptor Lake

public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed

From: "hubicka at gcc dot gnu.org" <gcc-bugzilla@gcc.gnu.org>
To: gcc-bugs@gcc.gnu.org
Subject: [Bug target/109812] GraphicsMagick resize is a lot slower in GCC 13.1 vs Clang 16 on Intel Raptor Lake
Date: Sun, 28 May 2023 18:11:02 +0000	[thread overview]
Message-ID: <bug-109812-4-AAH1RonwxN@http.gcc.gnu.org/bugzilla/> (raw)
In-Reply-To: <bug-109812-4@http.gcc.gnu.org/bugzilla/>

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109812

--- Comment #9 from Jan Hubicka <hubicka at gcc dot gnu.org> ---
Oddly enough simplified version of the loop SLP vectorizes for me:
struct rgb {unsigned char r,g,b;} *rgbs;
int *addr;
double *weights;
struct drgb {double r,g,b;};

struct drgb sum()
{
        struct drgb r;
        for (int i = 0; i < 100000; i++)
        {
          int j = addr[i];
          double w = weights[i];
          r.r += rgbs[j].r * w;
          r.g += rgbs[j].g * w;
          r.b += rgbs[j].b * w;
        }
        return r;
}
I get:
L2:
        movslq  (%r9,%rdx,4), %rax
        vmovsd  (%r8,%rdx,8), %xmm1
        incq    %rdx
        leaq    (%rax,%rax,2), %rax
        addq    %rsi, %rax
        movzbl  (%rax), %ecx
        vmovddup        %xmm1, %xmm4
        vmovd   %ecx, %xmm0
        movzbl  1(%rax), %ecx
        movzbl  2(%rax), %eax
        vpinsrd $1, %ecx, %xmm0, %xmm0
        vcvtdq2pd       %xmm0, %xmm0
        vfmadd231pd     %xmm4, %xmm0, %xmm2
        vcvtsi2sdl      %eax, %xmm5, %xmm0
        vfmadd231sd     %xmm1, %xmm0, %xmm3
        cmpq    $100000, %rdx
        jne     .L2


I think the actual loop is:
 <bb 53> [local count: 44202554]:
  _106 = _262->pixel;
  _109 = *source_231(D).columns;

  <bb 54> [local count: 401841405]:
  # pixel$green_332 = PHI <_124(89), pixel$green_265(53)>
  # i_357 = PHI <i_298(89), 0(53)>
  # pixel$red_371 = PHI <_119(89), pixel$red_263(53)>
  # pixel$blue_377 = PHI <_129(89), pixel$blue_267(53)>
  i.51_102 = (long unsigned int) i_357;
  _103 = i.51_102 * 16;
  _104 = _262 + _103;
  _105 = _104->pixel;
  _107 = _105 - _106;
  _108 = (long unsigned int) _107;
  _110 = _108 * _109;
  _112 = _110 + _621;
  weight_297 = _104->weight;
  _113 = _112 * 4;
  _114 = _276 + _113;
  _115 = _114->red;
  _116 = (int) _115;
  _117 = (double) _116;
  _118 = _117 * weight_297;
  _119 = _118 + pixel$red_371;
  _120 = _114->green;
 _121 = (int) _120;
  _122 = (double) _121;
  _123 = _122 * weight_297;
  _124 = _123 + pixel$green_332;
  _125 = _114->blue;
  _126 = (int) _125;
  _127 = (double) _126;
  _128 = _127 * weight_297;
  _129 = _128 + pixel$blue_377;
  i_298 = i_357 + 1;
  if (n_195 > i_298)
    goto <bb 89>; [89.00%]
  else
    goto <bb 118>; [11.00%]

  <bb 118> [local count: 44202554]:
  # _607 = PHI <_124(54)>
  # _606 = PHI <_119(54)>
  # _605 = PHI <_129(54)>
  goto <bb 55>; [100.00%]

  <bb 89> [local count: 357638851]:
  goto <bb 54>; [100.00%]


and SLP vectorizer seems to claim:
../magick/resize.c:1284:52: note:       _125 = _114->blue;
../magick/resize.c:1284:52: note:       _120 = _114->green;
../magick/resize.c:1284:52: note:       _115 = _114->red;
../magick/resize.c:1284:52: missed:   not consecutive access weight_297 =
_104->weight;
../magick/resize.c:1284:52: missed:   not consecutive access _105 =
_104->pixel;
../magick/resize.c:1284:52: missed:   not consecutive access _134->red =
iftmp.57_207;
../magick/resize.c:1284:52: missed:   not consecutive access _134->green =
iftmp.60_208;
../magick/resize.c:1284:52: missed:   not consecutive access _134->blue =
iftmp.63_209;
../magick/resize.c:1284:52: missed:   not consecutive access _134->opacity = 0;
../magick/resize.c:1284:52: missed:   not consecutive access _63 =
*source_231(D).columns;
../magick/resize.c:1284:52: missed:   not consecutive access _60 = _262->pixel;

Not sure if that is related to the real testcase:


struct rgb {unsigned char r,g,b;} *rgbs;
int *addr;
double *weights;
struct drgb {double r,g,b,o;};

struct drgb sum()
{
        struct drgb r;
        for (int i = 0; i < 100000; i++)
        {
          int j = addr[i];
          double w = weights[i];
          r.r += rgbs[j].r * w;
          r.g += rgbs[j].g * w;
          r.b += rgbs[j].b * w;
        }
        return r;
}

make us to miss the vectorization even though there is nothing using drgb->o:

sum:
.LFB0:
        .cfi_startproc
        movq    %rdi, %r8
        movq    weights(%rip), %rsi
        movq    addr(%rip), %rdi
        vxorps  %xmm2, %xmm2, %xmm2
        movq    rgbs(%rip), %rcx
        xorl    %edx, %edx
        .p2align 4
        .p2align 3
.L2:
        movslq  (%rdi,%rdx,4), %rax
        vmovsd  (%rsi,%rdx,8), %xmm0
        incq    %rdx
        leaq    (%rax,%rax,2), %rax
        addq    %rcx, %rax
        movzbl  (%rax), %r9d
        vcvtsi2sdl      %r9d, %xmm2, %xmm1
        movzbl  1(%rax), %r9d
        movzbl  2(%rax), %eax
        vfmadd231sd     %xmm0, %xmm1, %xmm3
        vcvtsi2sdl      %r9d, %xmm2, %xmm1
        vfmadd231sd     %xmm0, %xmm1, %xmm5
        vcvtsi2sdl      %eax, %xmm2, %xmm1
        vfmadd231sd     %xmm0, %xmm1, %xmm4
        cmpq    $100000, %rdx
        jne     .L2
        vmovq   %xmm4, %xmm4
        vunpcklpd       %xmm5, %xmm3, %xmm0
        movq    %r8, %rax
        vinsertf128     $0x1, %xmm4, %ymm0, %ymm0
        vmovupd %ymm0, (%r8)
        vzeroupper
        ret

next prev parent reply	other threads:[~2023-05-28 18:11 UTC|newest]

Thread overview: 23+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-05-11 14:25 [Bug tree-optimization/109812] New: GraphicsMagick resize is a lot slower in GCC 13.1 vs Clang 16 aros at gmx dot com
2023-05-11 14:26 ` [Bug tree-optimization/109812] " aros at gmx dot com
2023-05-11 15:20 ` [Bug target/109812] " pinskia at gcc dot gnu.org
2023-05-11 15:50 ` aros at gmx dot com
2023-05-12  8:47 ` aros at gmx dot com
2023-05-16 22:43 ` juzhe.zhong at rivai dot ai
2023-05-17  0:08 ` sjames at gcc dot gnu.org
2023-05-28 16:46 ` hubicka at gcc dot gnu.org
2023-05-28 17:29 ` [Bug target/109812] GraphicsMagick resize is a lot slower in GCC 13.1 vs Clang 16 on Intel Raptor Lake hubicka at gcc dot gnu.org
2023-05-28 17:39 ` hubicka at gcc dot gnu.org
2023-05-28 18:11 ` hubicka at gcc dot gnu.org [this message]
2023-05-28 18:50 ` hubicka at gcc dot gnu.org
2023-05-30  0:05 ` zhangjungcc at gmail dot com
2023-05-31 12:42 ` hubicka at ucw dot cz
2023-05-31 16:11 ` hubicka at gcc dot gnu.org
2023-05-31 16:52 ` jamborm at gcc dot gnu.org
2023-06-01  9:38 ` jamborm at gcc dot gnu.org
2023-06-01 11:19 ` jakub at gcc dot gnu.org
2023-06-01 12:28 ` hubicka at gcc dot gnu.org
2023-06-21  9:46 ` ubizjak at gmail dot com
2023-10-12  4:48 ` cvs-commit at gcc dot gnu.org
2023-11-24 23:38 ` hubicka at gcc dot gnu.org
2023-11-25 10:21 ` liuhongt at gcc dot gnu.org

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=bug-109812-4-AAH1RonwxN@http.gcc.gnu.org/bugzilla/ \
    --to=gcc-bugzilla@gcc.gnu.org \
    --cc=gcc-bugs@gcc.gnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).