From: "rguenth at gcc dot gnu.org"
To: gcc-bugs@gcc.gnu.org
Subject: [Bug tree-optimization/97832] AoSoA complex caxpy-like loops: AVX2+FMA -Ofast 7 times slower than -O3
Date: Fri, 25 Nov 2022 08:16:15 +0000
X-Bugzilla-Product: gcc
X-Bugzilla-Component: tree-optimization
X-Bugzilla-Version: 10.2.0
X-Bugzilla-Keywords: missed-optimization
X-Bugzilla-Severity: normal
X-Bugzilla-Priority: P3
X-Bugzilla-Status: RESOLVED
X-Bugzilla-Resolution: FIXED
X-Bugzilla-Assigned-To: rguenth at gcc dot gnu.org
X-Bugzilla-Target-Milestone: 12.0
X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97832

--- Comment #15 from Richard Biener ---
I can confirm we get

.L3:
        vmovupd (%rsi), %ymm1
        vmovupd 32(%rsi), %ymm0
        addl    $1, %eax
        addq    $64, %rdi
        addq    $64, %rsi
        vblendpd        $14, %ymm1, %ymm0, %ymm3
        vblendpd        $14, %ymm0, %ymm1, %ymm2
        vfnmadd213pd    -64(%rdi), %ymm5, %ymm3
        vfmadd213pd     -32(%rdi), %ymm7, %ymm1
        vfnmadd132pd    %ymm4, %ymm3, %ymm2
        vfnmadd132pd    %ymm6, %ymm1, %ymm0
        vmovupd %ymm2, -64(%rdi)
        vmovupd %ymm0, -32(%rdi)
        cmpl    %edx, %eax
        jb      .L3

instead of

.L3:
        vmovupd (%rdx), %ymm1
        vmovupd (%rdx), %ymm0
        addl    $1, %ecx
        addq    $64, %rax
        vfmadd213pd     -32(%rax), %ymm3, %ymm1
        vfnmadd213pd    -64(%rax), %ymm2, %ymm0
        addq    $64, %rdx
        vfnmadd231pd    -32(%rdx), %ymm3, %ymm0
        vfnmadd231pd    -32(%rdx), %ymm2, %ymm1
        vmovupd %ymm0, -64(%rax)
        vmovupd %ymm1, -32(%rax)
        cmpl    %esi, %ecx
        jb      .L3
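For reference, the kernel is an AoSoA complex caxpy-like loop; a minimal
sketch that is consistent with the FMA/FNMA chains in the dumps below (the
cblock layout, the caxpy_like name and the f_re/f_im parameters are my own
assumptions, not copied from the reproducer attached to this PR) would be:

  /* Sketch only: blocks of 4 real parts followed by 4 imaginary parts,
     updated with a complex factor (f_re, f_im).  */
  typedef struct { double re[4]; double im[4]; } cblock;

  void
  caxpy_like (cblock *restrict y, const cblock *restrict x,
              double f_re, double f_im, int n)
  {
    for (int i = 0; i < n; ++i)
      for (int j = 0; j < 4; ++j)
        {
          double x_re = x[i].re[j], x_im = x[i].im[j];
          y[i].re[j] -= f_re * x_re + f_im * x_im;   /* FNMA + FNMA chain */
          y[i].im[j] += f_im * x_re - f_re * x_im;   /* FMA + FNMA chain  */
        }
  }

With that reading, _196/_197 in the dumps below would be the broadcast
f_re/f_im vectors.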
The good case sees

  [local count: 214748368]:
  # ivtmp.27_211 = PHI
  # ivtmp.32_209 = PHI
  # ivtmp.34_28 = PHI
  _53 = (void *) ivtmp.34_28;
  vect_x_re_54.13_193 = MEM [(const double *)_53];
  vect_x_im_60.21_176 = MEM [(const double *)_53 + 32B];
  _54 = (void *) ivtmp.32_209;
  vect_y_re_62.9_200 = MEM [(double *)_54];
  vect_y_re_62.10_198 = MEM [(double *)_54 + 32B];
  vect__154.17_185 = .FMA (vect_x_re_54.13_193, _197, vect_y_re_62.10_198);
  vect__66.16_188 = .FNMA (vect_x_re_54.13_193, _196, vect_y_re_62.9_200);
  vect_y_re_68.23_173 = .FNMA (vect_x_im_60.21_176, _197, vect__66.16_188);
  vect_y_re_68.23_172 = .FNMA (vect_x_im_60.21_176, _196, vect__154.17_185);
  MEM [(double *)_54] = vect_y_re_68.23_173;
  MEM [(double *)_54 + 32B] = vect_y_re_68.23_172;
  ivtmp.27_210 = ivtmp.27_211 + 1;
  ivtmp.32_208 = ivtmp.32_209 + 64;
  ivtmp.34_51 = ivtmp.34_28 + 64;
  if (bnd.6_207 > ivtmp.27_210)
    goto ; [90.00%]

while the bad has

  [local count: 214748368]:
  # ivtmp.31_65 = PHI
  # ivtmp.36_63 = PHI
  # ivtmp.38_203 = PHI
  _61 = (void *) ivtmp.38_203;
  vect_x_im_60.13_211 = MEM [(const double *)_61];
  vect_x_im_60.14_209 = MEM [(const double *)_61 + 32B];
  vect_x_re_54.15_208 = VEC_PERM_EXPR ;
  vect_x_re_54.23_192 = VEC_PERM_EXPR ;
  _58 = (void *) ivtmp.36_63;
  vect_y_re_62.9_218 = MEM [(double *)_58];
  vect_y_re_62.10_216 = MEM [(double *)_58 + 32B];
  vect__41.18_202 = .FMA (vect_x_im_60.13_211, _215, vect_y_re_62.10_216);
  vect_y_re_68.17_205 = .FNMA (vect_x_re_54.15_208, _214, vect_y_re_62.9_218);
  vect_y_re_68.25_189 = .FNMA (vect_x_re_54.23_192, _198, vect_y_re_68.17_205);
  vect_y_re_68.25_188 = .FNMA (_199, vect_x_im_60.14_209, vect__41.18_202);
  MEM [(double *)_58] = vect_y_re_68.25_189;
  MEM [(double *)_58 + 32B] = vect_y_re_68.25_188;
  ivtmp.31_64 = ivtmp.31_65 + 1;
  ivtmp.36_62 = ivtmp.36_63 + 64;
  ivtmp.38_59 = ivtmp.38_203 + 64;
  if (ivtmp.31_64 < bnd.6_225)
    goto ; [90.00%]

The blends do not look like no-ops, so I wonder if this is really computing
the same thing ... (it swaps lane 0 between the two loads from x but not in
the stores).
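To make that lane-swap observation concrete, this is what the two
vblendpd $14 instructions in the first loop compute, written with AVX
intrinsics (illustration only; x0/x1 and t0/t1 are my names for the two
32-byte loads from x and for the blended vectors the FMAs consume):

  #include <immintrin.h>

  static inline void
  show_blends (const double *x, __m256d *t0, __m256d *t1)
  {
    __m256d x0 = _mm256_loadu_pd (x);      /* vmovupd (%rsi), %ymm1   */
    __m256d x1 = _mm256_loadu_pd (x + 4);  /* vmovupd 32(%rsi), %ymm0 */
    /* vblendpd $14, %ymm1, %ymm0, %ymm3: lane 0 from x1, lanes 1-3 from x0.  */
    *t0 = _mm256_blend_pd (x1, x0, 0xe);
    /* vblendpd $14, %ymm0, %ymm1, %ymm2: lane 0 from x0, lanes 1-3 from x1.  */
    *t1 = _mm256_blend_pd (x0, x1, 0xe);
  }

So t0/t1 are the two loaded x vectors with their lane 0 exchanged before the
FMAs, while the y vectors are stored back unpermuted, which is why the blends
cannot be dropped as no-ops.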