[Bug tree-optimization/97832] AoSoA complex caxpy-like loops: AVX2+FMA -Ofast 7 times slower than -O3

public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed

From: "rguenth at gcc dot gnu.org" <gcc-bugzilla@gcc.gnu.org>
To: gcc-bugs@gcc.gnu.org
Subject: [Bug tree-optimization/97832] AoSoA complex caxpy-like loops: AVX2+FMA -Ofast 7 times slower than -O3
Date: Fri, 25 Nov 2022 08:16:15 +0000	[thread overview]
Message-ID: <bug-97832-4-Wcj0wdNKgR@http.gcc.gnu.org/bugzilla/> (raw)
In-Reply-To: <bug-97832-4@http.gcc.gnu.org/bugzilla/>

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97832

--- Comment #15 from Richard Biener <rguenth at gcc dot gnu.org> ---
I can confirm we get

.L3:
        vmovupd (%rsi), %ymm1
        vmovupd 32(%rsi), %ymm0
        addl    $1, %eax
        addq    $64, %rdi
        addq    $64, %rsi
        vblendpd        $14, %ymm1, %ymm0, %ymm3
        vblendpd        $14, %ymm0, %ymm1, %ymm2
        vfnmadd213pd    -64(%rdi), %ymm5, %ymm3
        vfmadd213pd     -32(%rdi), %ymm7, %ymm1
        vfnmadd132pd    %ymm4, %ymm3, %ymm2
        vfnmadd132pd    %ymm6, %ymm1, %ymm0
        vmovupd %ymm2, -64(%rdi)
        vmovupd %ymm0, -32(%rdi)
        cmpl    %edx, %eax
        jb      .L3

instead of

.L3:
        vmovupd (%rdx), %ymm1
        vmovupd (%rdx), %ymm0
        addl    $1, %ecx
        addq    $64, %rax
        vfmadd213pd     -32(%rax), %ymm3, %ymm1
        vfnmadd213pd    -64(%rax), %ymm2, %ymm0
        addq    $64, %rdx
        vfnmadd231pd    -32(%rdx), %ymm3, %ymm0
        vfnmadd231pd    -32(%rdx), %ymm2, %ymm1
        vmovupd %ymm0, -64(%rax)
        vmovupd %ymm1, -32(%rax)
        cmpl    %esi, %ecx
        jb      .L3

the good case sees

  <bb 4> [local count: 214748368]:
  # ivtmp.27_211 = PHI <ivtmp.27_210(4), 0(3)>
  # ivtmp.32_209 = PHI <ivtmp.32_208(4), ivtmp.32_212(3)>
  # ivtmp.34_28 = PHI <ivtmp.34_51(4), ivtmp.34_52(3)>
  _53 = (void *) ivtmp.34_28;
  vect_x_re_54.13_193 = MEM <const vector(4) double> [(const double *)_53];
  vect_x_im_60.21_176 = MEM <const vector(4) double> [(const double *)_53 + 32B];
  _54 = (void *) ivtmp.32_209;
  vect_y_re_62.9_200 = MEM <vector(4) double> [(double *)_54];
  vect_y_re_62.10_198 = MEM <vector(4) double> [(double *)_54 + 32B];
  vect__154.17_185 = .FMA (vect_x_re_54.13_193, _197, vect_y_re_62.10_198);
  vect__66.16_188 = .FNMA (vect_x_re_54.13_193, _196, vect_y_re_62.9_200);
  vect_y_re_68.23_173 = .FNMA (vect_x_im_60.21_176, _197, vect__66.16_188);
  vect_y_re_68.23_172 = .FNMA (vect_x_im_60.21_176, _196, vect__154.17_185);
  MEM <vector(4) double> [(double *)_54] = vect_y_re_68.23_173;
  MEM <vector(4) double> [(double *)_54 + 32B] = vect_y_re_68.23_172;
  ivtmp.27_210 = ivtmp.27_211 + 1;
  ivtmp.32_208 = ivtmp.32_209 + 64;
  ivtmp.34_51 = ivtmp.34_28 + 64;
  if (bnd.6_207 > ivtmp.27_210)
    goto <bb 4>; [90.00%]

while the bad has

  <bb 4> [local count: 214748368]:
  # ivtmp.31_65 = PHI <ivtmp.31_64(4), 0(3)>
  # ivtmp.36_63 = PHI <ivtmp.36_62(4), ivtmp.36_204(3)>
  # ivtmp.38_203 = PHI <ivtmp.38_59(4), ivtmp.38_60(3)>
  _61 = (void *) ivtmp.38_203;
  vect_x_im_60.13_211 = MEM <const vector(4) double> [(const double *)_61];
  vect_x_im_60.14_209 = MEM <const vector(4) double> [(const double *)_61 + 32B];
  vect_x_re_54.15_208 = VEC_PERM_EXPR <vect_x_im_60.14_209, vect_x_im_60.13_211, { 0, 5, 6, 7 }>;
  vect_x_re_54.23_192 = VEC_PERM_EXPR <vect_x_im_60.13_211, vect_x_im_60.14_209, { 0, 5, 6, 7 }>;
  _58 = (void *) ivtmp.36_63;
  vect_y_re_62.9_218 = MEM <vector(4) double> [(double *)_58];
  vect_y_re_62.10_216 = MEM <vector(4) double> [(double *)_58 + 32B];
  vect__41.18_202 = .FMA (vect_x_im_60.13_211, _215, vect_y_re_62.10_216);
  vect_y_re_68.17_205 = .FNMA (vect_x_re_54.15_208, _214, vect_y_re_62.9_218);
  vect_y_re_68.25_189 = .FNMA (vect_x_re_54.23_192, _198, vect_y_re_68.17_205);
  vect_y_re_68.25_188 = .FNMA (_199, vect_x_im_60.14_209, vect__41.18_202);
  MEM <vector(4) double> [(double *)_58] = vect_y_re_68.25_189;
  MEM <vector(4) double> [(double *)_58 + 32B] = vect_y_re_68.25_188;
  ivtmp.31_64 = ivtmp.31_65 + 1;
  ivtmp.36_62 = ivtmp.36_63 + 64;
  ivtmp.38_59 = ivtmp.38_203 + 64;
  if (ivtmp.31_64 < bnd.6_225)
    goto <bb 4>; [90.00%]

The blends do not look like no-ops, so I wonder whether this is really
computing the same thing ... (it swaps lane 0 between the two loads from x,
but does not do the same for the stores).

next prev parent reply	other threads:[~2022-11-25  8:16 UTC|newest]

Thread overview: 28+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-11-14 20:44 [Bug target/97832] New: " already5chosen at yahoo dot com
2020-11-16  7:21 ` [Bug target/97832] " rguenth at gcc dot gnu.org
2020-11-16 11:11 ` rguenth at gcc dot gnu.org
2020-11-16 20:11 ` already5chosen at yahoo dot com
2020-11-17  9:21 ` [Bug tree-optimization/97832] " rguenth at gcc dot gnu.org
2020-11-17 10:18 ` rguenth at gcc dot gnu.org
2020-11-18  8:53 ` rguenth at gcc dot gnu.org
2020-11-18  9:15 ` rguenth at gcc dot gnu.org
2020-11-18 13:23 ` rguenth at gcc dot gnu.org
2020-11-18 13:39 ` rguenth at gcc dot gnu.org
2020-11-19 19:55 ` already5chosen at yahoo dot com
2020-11-20  7:10 ` rguenth at gcc dot gnu.org
2021-06-09 12:41 ` cvs-commit at gcc dot gnu.org
2021-06-09 12:54 ` rguenth at gcc dot gnu.org
2022-01-21  0:16 ` pinskia at gcc dot gnu.org
2022-11-24 23:22 ` already5chosen at yahoo dot com
2022-11-25  8:16 ` rguenth at gcc dot gnu.org [this message]
2022-11-25 13:19 ` already5chosen at yahoo dot com
2022-11-25 20:46 ` rguenth at gcc dot gnu.org
2022-11-25 21:27 ` amonakov at gcc dot gnu.org
2022-11-26 18:27 ` already5chosen at yahoo dot com
2022-11-26 18:36 ` already5chosen at yahoo dot com
2022-11-26 19:36 ` amonakov at gcc dot gnu.org
2022-11-26 22:00 ` already5chosen at yahoo dot com
2022-11-28  6:29 ` crazylht at gmail dot com
2022-11-28  6:42 ` crazylht at gmail dot com
2022-11-28  7:21 ` rguenther at suse dot de
2022-11-28  7:24 ` crazylht at gmail dot com

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=bug-97832-4-Wcj0wdNKgR@http.gcc.gnu.org/bugzilla/ \
    --to=gcc-bugzilla@gcc.gnu.org \
    --cc=gcc-bugs@gcc.gnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).