From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 48) id CD6D2385842C; Sun, 28 May 2023 18:11:03 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org CD6D2385842C DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org; s=default; t=1685297463; bh=wm6qQFfcKcE2J6vWslK8MVbtLyFBPloFxoscu9cqG9k=; h=From:To:Subject:Date:In-Reply-To:References:From; b=gbdGGQ5SCCPe8JOJBWPqE33Zp8JBYjeuLoGgiag3Zg3pGMatGd4TI4QpgD1RanHUQ XgWG8qd1kL5oFlu03YYGx6w3rT/oprlldDO75eJiq1H3i+cvHuBKNA+lzm1dXyOoKj 01BdVSsqymjPedaA+FQRidWFDHZSdftVpBgiNpBc= From: "hubicka at gcc dot gnu.org" To: gcc-bugs@gcc.gnu.org Subject: [Bug target/109812] GraphicsMagick resize is a lot slower in GCC 13.1 vs Clang 16 on Intel Raptor Lake Date: Sun, 28 May 2023 18:11:02 +0000 X-Bugzilla-Reason: CC X-Bugzilla-Type: changed X-Bugzilla-Watch-Reason: None X-Bugzilla-Product: gcc X-Bugzilla-Component: target X-Bugzilla-Version: 13.1.1 X-Bugzilla-Keywords: missed-optimization X-Bugzilla-Severity: normal X-Bugzilla-Who: hubicka at gcc dot gnu.org X-Bugzilla-Status: NEW X-Bugzilla-Resolution: X-Bugzilla-Priority: P3 X-Bugzilla-Assigned-To: unassigned at gcc dot gnu.org X-Bugzilla-Target-Milestone: --- X-Bugzilla-Flags: X-Bugzilla-Changed-Fields: Message-ID: In-Reply-To: References: Content-Type: text/plain; charset="UTF-8" Content-Transfer-Encoding: quoted-printable X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/ Auto-Submitted: auto-generated MIME-Version: 1.0 List-Id: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109812 --- Comment #9 from Jan Hubicka --- Oddly enough, a simplified version of the loop SLP-vectorizes for me: struct rgb {unsigned char r,g,b;} *rgbs; int *addr; double *weights; struct drgb {double r,g,b;}; struct drgb sum() { struct drgb r; for (int i = 0; i < 100000; i++) { int j = addr[i]; double w = weights[i]; r.r += rgbs[j].r * w; r.g += rgbs[j].g * w; r.b += rgbs[j].b * w; } return r; } I get: .L2: movslq (%r9,%rdx,4), %rax 
vmovsd (%r8,%rdx,8), %xmm1 incq %rdx leaq (%rax,%rax,2), %rax addq %rsi, %rax movzbl (%rax), %ecx vmovddup %xmm1, %xmm4 vmovd %ecx, %xmm0 movzbl 1(%rax), %ecx movzbl 2(%rax), %eax vpinsrd $1, %ecx, %xmm0, %xmm0 vcvtdq2pd %xmm0, %xmm0 vfmadd231pd %xmm4, %xmm0, %xmm2 vcvtsi2sdl %eax, %xmm5, %xmm0 vfmadd231sd %xmm1, %xmm0, %xmm3 cmpq $100000, %rdx jne .L2 I think the actual loop is: [local count: 44202554]: _106 = _262->pixel; _109 = *source_231(D).columns; [local count: 401841405]: # pixel$green_332 = PHI <_124(89), pixel$green_265(53)> # i_357 = PHI # pixel$red_371 = PHI <_119(89), pixel$red_263(53)> # pixel$blue_377 = PHI <_129(89), pixel$blue_267(53)> i.51_102 = (long unsigned int) i_357; _103 = i.51_102 * 16; _104 = _262 + _103; _105 = _104->pixel; _107 = _105 - _106; _108 = (long unsigned int) _107; _110 = _108 * _109; _112 = _110 + _621; weight_297 = _104->weight; _113 = _112 * 4; _114 = _276 + _113; _115 = _114->red; _116 = (int) _115; _117 = (double) _116; _118 = _117 * weight_297; _119 = _118 + pixel$red_371; _120 = _114->green; _121 = (int) _120; _122 = (double) _121; _123 = _122 * weight_297; _124 = _123 + pixel$green_332; _125 = _114->blue; _126 = (int) _125; _127 = (double) _126; _128 = _127 * weight_297; _129 = _128 + pixel$blue_377; i_298 = i_357 + 1; if (n_195 > i_298) goto ; [89.00%] else goto ; [11.00%] [local count: 44202554]: # _607 = PHI <_124(54)> # _606 = PHI <_119(54)> # _605 = PHI <_129(54)> goto ; [100.00%] [local count: 357638851]: goto ; [100.00%] and the SLP vectorizer seems to claim: ../magick/resize.c:1284:52: note: _125 = _114->blue; ../magick/resize.c:1284:52: note: _120 = _114->green; ../magick/resize.c:1284:52: note: _115 = _114->red; ../magick/resize.c:1284:52: missed: not consecutive access weight_297 = _104->weight; ../magick/resize.c:1284:52: missed: not consecutive access _105 = _104->pixel; ../magick/resize.c:1284:52: missed: not consecutive access 
_134->red = iftmp.57_207; ../magick/resize.c:1284:52: missed: not consecutive access _134->green = iftmp.60_208; ../magick/resize.c:1284:52: missed: not consecutive access _134->blue = iftmp.63_209; ../magick/resize.c:1284:52: missed: not consecutive access _134->opacity = 0; ../magick/resize.c:1284:52: missed: not consecutive access _63 = *source_231(D).columns; ../magick/resize.c:1284:52: missed: not consecutive access _60 = _262->pixel; Not sure if that is related to the real testcase: struct rgb {unsigned char r,g,b;} *rgbs; int *addr; double *weights; struct drgb {double r,g,b,o;}; struct drgb sum() { struct drgb r; for (int i = 0; i < 100000; i++) { int j = addr[i]; double w = weights[i]; r.r += rgbs[j].r * w; r.g += rgbs[j].g * w; r.b += rgbs[j].b * w; } return r; } makes us miss the vectorization even though there is nothing using drgb->o: sum: .LFB0: .cfi_startproc movq %rdi, %r8 movq weights(%rip), %rsi movq addr(%rip), %rdi vxorps %xmm2, %xmm2, %xmm2 movq rgbs(%rip), %rcx xorl %edx, %edx .p2align 4 .p2align 3 .L2: movslq (%rdi,%rdx,4), %rax vmovsd (%rsi,%rdx,8), %xmm0 incq %rdx leaq (%rax,%rax,2), %rax addq %rcx, %rax movzbl (%rax), %r9d vcvtsi2sdl %r9d, %xmm2, %xmm1 movzbl 1(%rax), %r9d movzbl 2(%rax), %eax vfmadd231sd %xmm0, %xmm1, %xmm3 vcvtsi2sdl %r9d, %xmm2, %xmm1 vfmadd231sd %xmm0, %xmm1, %xmm5 vcvtsi2sdl %eax, %xmm2, %xmm1 vfmadd231sd %xmm0, %xmm1, %xmm4 cmpq $100000, %rdx jne .L2 vmovq %xmm4, %xmm4 vunpcklpd %xmm5, %xmm3, %xmm0 movq %r8, %rax vinsertf128 $0x1, %xmm4, %ymm0, %ymm0 vmovupd %ymm0, (%r8) vzeroupper ret