From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <gcc-bugzilla@gcc.gnu.org>
Received: by sourceware.org (Postfix, from userid 48)
	id CD6D2385842C; Sun, 28 May 2023 18:11:03 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org CD6D2385842C
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org;
	s=default; t=1685297463;
	bh=wm6qQFfcKcE2J6vWslK8MVbtLyFBPloFxoscu9cqG9k=;
	h=From:To:Subject:Date:In-Reply-To:References:From;
	b=gbdGGQ5SCCPe8JOJBWPqE33Zp8JBYjeuLoGgiag3Zg3pGMatGd4TI4QpgD1RanHUQ
	 XgWG8qd1kL5oFlu03YYGx6w3rT/oprlldDO75eJiq1H3i+cvHuBKNA+lzm1dXyOoKj
	 01BdVSsqymjPedaA+FQRidWFDHZSdftVpBgiNpBc=
From: "hubicka at gcc dot gnu.org" <gcc-bugzilla@gcc.gnu.org>
To: gcc-bugs@gcc.gnu.org
Subject: [Bug target/109812] GraphicsMagick resize is a lot slower in GCC
 13.1 vs Clang 16 on Intel Raptor Lake
Date: Sun, 28 May 2023 18:11:02 +0000
X-Bugzilla-Reason: CC
X-Bugzilla-Type: changed
X-Bugzilla-Watch-Reason: None
X-Bugzilla-Product: gcc
X-Bugzilla-Component: target
X-Bugzilla-Version: 13.1.1
X-Bugzilla-Keywords: missed-optimization
X-Bugzilla-Severity: normal
X-Bugzilla-Who: hubicka at gcc dot gnu.org
X-Bugzilla-Status: NEW
X-Bugzilla-Resolution: 
X-Bugzilla-Priority: P3
X-Bugzilla-Assigned-To: unassigned at gcc dot gnu.org
X-Bugzilla-Target-Milestone: ---
X-Bugzilla-Flags: 
X-Bugzilla-Changed-Fields: 
Message-ID: <bug-109812-4-AAH1RonwxN@http.gcc.gnu.org/bugzilla/>
In-Reply-To: <bug-109812-4@http.gcc.gnu.org/bugzilla/>
References: <bug-109812-4@http.gcc.gnu.org/bugzilla/>
Content-Type: text/plain; charset="UTF-8"
Content-Transfer-Encoding: quoted-printable
X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/
Auto-Submitted: auto-generated
MIME-Version: 1.0
List-Id: <gcc-bugs.sourceware.org>

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=3D109812
--- Comment #9 from Jan Hubicka <hubicka at gcc dot gnu.org> ---
Oddly enough, a simplified version of the loop does get SLP vectorized for me:
struct rgb {unsigned char r,g,b;} *rgbs;
int *addr;
double *weights;
struct drgb {double r,g,b;};

struct drgb sum()
{
        struct drgb r;
        for (int i =3D 0; i < 100000; i++)
        {
          int j =3D addr[i];
          double w =3D weights[i];
          r.r +=3D rgbs[j].r * w;
          r.g +=3D rgbs[j].g * w;
          r.b +=3D rgbs[j].b * w;
        }
        return r;
}
I get:
L2:
        movslq  (%r9,%rdx,4), %rax
        vmovsd  (%r8,%rdx,8), %xmm1
        incq    %rdx
        leaq    (%rax,%rax,2), %rax
        addq    %rsi, %rax
        movzbl  (%rax), %ecx
        vmovddup        %xmm1, %xmm4
        vmovd   %ecx, %xmm0
        movzbl  1(%rax), %ecx
        movzbl  2(%rax), %eax
        vpinsrd $1, %ecx, %xmm0, %xmm0
        vcvtdq2pd       %xmm0, %xmm0
        vfmadd231pd     %xmm4, %xmm0, %xmm2
        vcvtsi2sdl      %eax, %xmm5, %xmm0
        vfmadd231sd     %xmm1, %xmm0, %xmm3
        cmpq    $100000, %rdx
        jne     .L2


I think the actual loop is:
 <bb 53> [local count: 44202554]:
  _106 =3D _262->pixel;
  _109 =3D *source_231(D).columns;

  <bb 54> [local count: 401841405]:
  # pixel$green_332 =3D PHI <_124(89), pixel$green_265(53)>
  # i_357 =3D PHI <i_298(89), 0(53)>
  # pixel$red_371 =3D PHI <_119(89), pixel$red_263(53)>
  # pixel$blue_377 =3D PHI <_129(89), pixel$blue_267(53)>
  i.51_102 =3D (long unsigned int) i_357;
  _103 =3D i.51_102 * 16;
  _104 =3D _262 + _103;
  _105 =3D _104->pixel;
  _107 =3D _105 - _106;
  _108 =3D (long unsigned int) _107;
  _110 =3D _108 * _109;
  _112 =3D _110 + _621;
  weight_297 =3D _104->weight;
  _113 =3D _112 * 4;
  _114 =3D _276 + _113;
  _115 =3D _114->red;
  _116 =3D (int) _115;
  _117 =3D (double) _116;
  _118 =3D _117 * weight_297;
  _119 =3D _118 + pixel$red_371;
  _120 =3D _114->green;
 _121 =3D (int) _120;
  _122 =3D (double) _121;
  _123 =3D _122 * weight_297;
  _124 =3D _123 + pixel$green_332;
  _125 =3D _114->blue;
  _126 =3D (int) _125;
  _127 =3D (double) _126;
  _128 =3D _127 * weight_297;
  _129 =3D _128 + pixel$blue_377;
  i_298 =3D i_357 + 1;
  if (n_195 > i_298)
    goto <bb 89>; [89.00%]
  else
    goto <bb 118>; [11.00%]

  <bb 118> [local count: 44202554]:
  # _607 =3D PHI <_124(54)>
  # _606 =3D PHI <_119(54)>
  # _605 =3D PHI <_129(54)>
  goto <bb 55>; [100.00%]

  <bb 89> [local count: 357638851]:
  goto <bb 54>; [100.00%]


and the SLP vectorizer seems to claim:
../magick/resize.c:1284:52: note:       _125 =3D _114->blue;
../magick/resize.c:1284:52: note:       _120 =3D _114->green;
../magick/resize.c:1284:52: note:       _115 =3D _114->red;
../magick/resize.c:1284:52: missed:   not consecutive access weight_297 =3D
_104->weight;
../magick/resize.c:1284:52: missed:   not consecutive access _105 =3D
_104->pixel;
../magick/resize.c:1284:52: missed:   not consecutive access _134->red =3D
iftmp.57_207;
../magick/resize.c:1284:52: missed:   not consecutive access _134->green =3D
iftmp.60_208;
../magick/resize.c:1284:52: missed:   not consecutive access _134->blue =3D
iftmp.63_209;
../magick/resize.c:1284:52: missed:   not consecutive access _134->opacity =
=3D 0;
../magick/resize.c:1284:52: missed:   not consecutive access _63 =3D
*source_231(D).columns;
../magick/resize.c:1284:52: missed:   not consecutive access _60 =3D _262->=
pixel;

Not sure if that is related to the real testcase:


struct rgb {unsigned char r,g,b;} *rgbs;
int *addr;
double *weights;
struct drgb {double r,g,b,o;};

struct drgb sum()
{
        struct drgb r;
        for (int i =3D 0; i < 100000; i++)
        {
          int j =3D addr[i];
          double w =3D weights[i];
          r.r +=3D rgbs[j].r * w;
          r.g +=3D rgbs[j].g * w;
          r.b +=3D rgbs[j].b * w;
        }
        return r;
}

This makes us miss the vectorization even though nothing uses drgb->o:

sum:
.LFB0:
        .cfi_startproc
        movq    %rdi, %r8
        movq    weights(%rip), %rsi
        movq    addr(%rip), %rdi
        vxorps  %xmm2, %xmm2, %xmm2
        movq    rgbs(%rip), %rcx
        xorl    %edx, %edx
        .p2align 4
        .p2align 3
.L2:
        movslq  (%rdi,%rdx,4), %rax
        vmovsd  (%rsi,%rdx,8), %xmm0
        incq    %rdx
        leaq    (%rax,%rax,2), %rax
        addq    %rcx, %rax
        movzbl  (%rax), %r9d
        vcvtsi2sdl      %r9d, %xmm2, %xmm1
        movzbl  1(%rax), %r9d
        movzbl  2(%rax), %eax
        vfmadd231sd     %xmm0, %xmm1, %xmm3
        vcvtsi2sdl      %r9d, %xmm2, %xmm1
        vfmadd231sd     %xmm0, %xmm1, %xmm5
        vcvtsi2sdl      %eax, %xmm2, %xmm1
        vfmadd231sd     %xmm0, %xmm1, %xmm4
        cmpq    $100000, %rdx
        jne     .L2
        vmovq   %xmm4, %xmm4
        vunpcklpd       %xmm5, %xmm3, %xmm0
        movq    %r8, %rax
        vinsertf128     $0x1, %xmm4, %ymm0, %ymm0
        vmovupd %ymm0, (%r8)
        vzeroupper
        ret=