public inbox for gcc-bugs@sourceware.org
* [Bug target/97832] New: AoSoA complex caxpy-like loops: AVX2+FMA -Ofast 7 times slower than -O3
From: already5chosen at yahoo dot com @ 2020-11-14 20:44 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97832
Bug ID: 97832
Summary: AoSoA complex caxpy-like loops: AVX2+FMA -Ofast 7
times slower than -O3
Product: gcc
Version: 10.2.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: already5chosen at yahoo dot com
Target Milestone: ---
I am reporting under 'target' because AVX2+FMA is the only 256-bit SIMD
platform I have to play with. If it's really tree-optimization, please change.
void foo(double* restrict y, const double* restrict x0,
         const double* restrict x1, int clen)
{
  int xi = clen & 2;
  double f00_re = x0[0+xi+0];
  double f10_re = x1[0+xi+0];
  double f01_re = x0[0+xi+1];
  double f11_re = x1[0+xi+1];
  double f00_im = x0[4+xi+0];
  double f10_im = x1[4+xi+0];
  double f01_im = x0[4+xi+1];
  double f11_im = x1[4+xi+1];
  int clen2 = (clen+xi) * 2;
  double* y0 = &y[0];
  double* y1 = &y[clen2];
  #pragma GCC unroll 0
  for (int c = 0; c < clen2; c += 8) {
    // y0[c] = y0[c] - x0[c]*conj(f00) - x1[c]*conj(f10);
    // y1[c] = y1[c] - x0[c]*conj(f01) - x1[c]*conj(f11);
    #pragma GCC unroll 4
    for (int k = 0; k < 4; ++k) {
      double x0_re = x0[c+0+k];
      double x0_im = x0[c+4+k];
      double y0_re = y0[c+0+k];
      double y0_im = y0[c+4+k];
      double y1_re = y1[c+0+k];
      double y1_im = y1[c+4+k];
      y0_re = y0_re - x0_re * f00_re - x0_im * f00_im;
      y0_im = y0_im + x0_re * f00_im - x0_im * f00_re;
      y1_re = y1_re - x0_re * f01_re - x0_im * f01_im;
      y1_im = y1_im + x0_re * f01_im - x0_im * f01_re;
      double x1_re = x1[c+0+k];
      double x1_im = x1[c+4+k];
      y0_re = y0_re - x1_re * f10_re - x1_im * f10_im;
      y0_im = y0_im + x1_re * f10_im - x1_im * f10_re;
      y1_re = y1_re - x1_re * f11_re - x1_im * f11_im;
      y1_im = y1_im + x1_re * f11_im - x1_im * f11_re;
      y0[c+0+k] = y0_re;
      y0[c+4+k] = y0_im;
      y1[c+0+k] = y1_re;
      y1[c+4+k] = y1_im;
    }
  }
}
When compiled with 'gcc 10.2 -march=skylake -O3' it produces pretty decent
code. The only problem is over-aggressive load+op combining, similar to what we
already discussed in PR 97127. It seems this problem can't be solved without a
major overhaul of the GCC optimizer architecture, but luckily the impact is
quite minor.
But when we compile with 'gcc 10.2 -march=skylake -Ofast' the fun begins:
.L5:
vmovupd (%r9), %ymm7
vmovupd 64(%r9), %ymm6
vunpcklpd 32(%r9), %ymm7, %ymm2
vunpckhpd 32(%r9), %ymm7, %ymm0
vmovupd 64(%r9), %ymm7
vmovupd 192(%r9), %ymm4
vunpckhpd 96(%r9), %ymm7, %ymm5
vmovupd 128(%r9), %ymm7
vunpcklpd 96(%r9), %ymm6, %ymm6
vunpcklpd 160(%r9), %ymm7, %ymm3
vunpckhpd 160(%r9), %ymm7, %ymm1
vmovupd 192(%r9), %ymm7
vunpcklpd 224(%r9), %ymm4, %ymm4
vunpckhpd 224(%r9), %ymm7, %ymm8
vpermpd $216, %ymm6, %ymm6
vpermpd $216, %ymm5, %ymm5
vpermpd $216, %ymm4, %ymm4
vpermpd $216, %ymm8, %ymm8
vpermpd $216, %ymm2, %ymm2
vpermpd $216, %ymm0, %ymm0
vpermpd $216, %ymm3, %ymm3
vpermpd $216, %ymm1, %ymm1
vunpcklpd %ymm6, %ymm2, %ymm7
vunpckhpd %ymm6, %ymm2, %ymm2
vunpcklpd %ymm4, %ymm3, %ymm6
vunpckhpd %ymm4, %ymm3, %ymm3
vunpcklpd %ymm5, %ymm0, %ymm4
vunpckhpd %ymm5, %ymm0, %ymm0
vunpcklpd %ymm8, %ymm1, %ymm5
vpermpd $216, %ymm5, %ymm5
vpermpd $216, %ymm4, %ymm4
vpermpd $216, %ymm3, %ymm3
vunpcklpd %ymm5, %ymm4, %ymm11
vpermpd $216, %ymm2, %ymm2
vunpckhpd %ymm5, %ymm4, %ymm4
vunpckhpd %ymm8, %ymm1, %ymm1
vpermpd $216, %ymm0, %ymm0
vpermpd $216, %ymm4, %ymm8
vpermpd $216, %ymm1, %ymm1
vunpcklpd %ymm3, %ymm2, %ymm4
vunpckhpd %ymm3, %ymm2, %ymm2
vpermpd $216, %ymm2, %ymm5
vunpcklpd %ymm1, %ymm0, %ymm2
vpermpd $216, %ymm4, %ymm10
vpermpd $216, %ymm2, %ymm4
vmovupd 64(%rax), %ymm2
vmovupd (%rax), %ymm3
vmovupd %ymm4, 448(%rsp)
vunpckhpd 96(%rax), %ymm2, %ymm4
vmovupd 128(%rax), %ymm2
vpermpd $216, %ymm6, %ymm6
vunpckhpd %ymm1, %ymm0, %ymm1
vpermpd $216, %ymm7, %ymm7
vunpcklpd 32(%rax), %ymm3, %ymm9
vunpckhpd 32(%rax), %ymm3, %ymm14
vunpckhpd 160(%rax), %ymm2, %ymm0
vmovupd 64(%rax), %ymm3
vunpcklpd %ymm6, %ymm7, %ymm12
vunpckhpd %ymm6, %ymm7, %ymm7
vpermpd $216, %ymm1, %ymm6
vunpcklpd 160(%rax), %ymm2, %ymm1
vmovupd 192(%rax), %ymm2
vunpcklpd 96(%rax), %ymm3, %ymm3
vmovupd %ymm5, 416(%rsp)
vunpcklpd 224(%rax), %ymm2, %ymm5
vunpckhpd 224(%rax), %ymm2, %ymm2
vpermpd $216, %ymm3, %ymm3
vpermpd $216, %ymm5, %ymm5
vpermpd $216, %ymm9, %ymm9
vpermpd $216, %ymm1, %ymm1
vpermpd $216, %ymm4, %ymm4
vpermpd $216, %ymm0, %ymm0
vmovupd %ymm10, 384(%rsp)
vpermpd $216, %ymm14, %ymm14
vunpcklpd %ymm3, %ymm9, %ymm10
vpermpd $216, %ymm2, %ymm2
vunpckhpd %ymm3, %ymm9, %ymm9
vunpcklpd %ymm5, %ymm1, %ymm3
vpermpd $216, %ymm3, %ymm3
vmovupd %ymm8, 288(%rsp)
vpermpd $216, %ymm10, %ymm10
vunpcklpd %ymm4, %ymm14, %ymm8
vunpckhpd %ymm4, %ymm14, %ymm14
vunpcklpd %ymm2, %ymm0, %ymm4
vpermpd $216, %ymm4, %ymm4
vpermpd $216, %ymm8, %ymm8
vunpckhpd %ymm2, %ymm0, %ymm2
vunpcklpd %ymm3, %ymm10, %ymm0
vpermpd $216, %ymm0, %ymm13
vunpcklpd %ymm4, %ymm8, %ymm0
vunpckhpd %ymm4, %ymm8, %ymm8
vpermpd $216, %ymm2, %ymm2
vunpckhpd %ymm3, %ymm10, %ymm10
vpermpd $216, %ymm14, %ymm14
vpermpd $216, %ymm0, %ymm3
vpermpd $216, %ymm8, %ymm0
vmovupd %ymm6, 480(%rsp)
vunpckhpd %ymm5, %ymm1, %ymm1
vmovupd %ymm3, 512(%rsp)
vmovupd (%rsi), %ymm3
vmovupd %ymm0, 544(%rsp)
vunpcklpd %ymm2, %ymm14, %ymm0
vpermpd $216, %ymm1, %ymm1
vpermpd $216, %ymm0, %ymm4
vpermpd $216, %ymm9, %ymm9
vunpcklpd %ymm1, %ymm9, %ymm6
vmovupd %ymm4, 640(%rsp)
vunpckhpd %ymm1, %ymm9, %ymm9
vunpcklpd 32(%rsi), %ymm3, %ymm4
vunpckhpd 32(%rsi), %ymm3, %ymm1
vmovupd 64(%rsi), %ymm3
vunpckhpd %ymm2, %ymm14, %ymm14
vunpcklpd 96(%rsi), %ymm3, %ymm8
vunpckhpd 96(%rsi), %ymm3, %ymm5
vmovupd 128(%rsi), %ymm3
vpermpd $216, %ymm14, %ymm2
vunpckhpd 160(%rsi), %ymm3, %ymm0
vmovupd %ymm2, 672(%rsp)
vunpcklpd 160(%rsi), %ymm3, %ymm2
vmovupd 192(%rsi), %ymm3
vmovupd 192(%rsi), %ymm14
vunpcklpd 224(%rsi), %ymm3, %ymm3
vpermpd $216, %ymm9, %ymm9
vmovupd %ymm9, 608(%rsp)
vunpckhpd 224(%rsi), %ymm14, %ymm9
vpermpd $216, %ymm8, %ymm8
vpermpd $216, %ymm3, %ymm3
vpermpd $216, %ymm6, %ymm6
vpermpd $216, %ymm4, %ymm4
vpermpd $216, %ymm2, %ymm2
vpermpd $216, %ymm5, %ymm5
vpermpd $216, %ymm9, %ymm9
vmovupd %ymm6, 576(%rsp)
vpermpd $216, %ymm1, %ymm1
vunpcklpd %ymm8, %ymm4, %ymm6
vpermpd $216, %ymm0, %ymm0
vunpckhpd %ymm8, %ymm4, %ymm4
vunpcklpd %ymm3, %ymm2, %ymm8
vpermpd $216, %ymm8, %ymm8
vpermpd $216, %ymm6, %ymm6
vunpckhpd %ymm3, %ymm2, %ymm2
vunpcklpd %ymm5, %ymm1, %ymm3
vunpckhpd %ymm5, %ymm1, %ymm1
vunpcklpd %ymm9, %ymm0, %ymm5
vpermpd $216, %ymm2, %ymm2
vpermpd $216, %ymm5, %ymm5
vunpcklpd %ymm8, %ymm6, %ymm14
vpermpd $216, %ymm4, %ymm4
vunpckhpd %ymm8, %ymm6, %ymm6
vpermpd $216, %ymm3, %ymm3
vunpckhpd %ymm9, %ymm0, %ymm0
vpermpd $216, %ymm6, %ymm9
vunpcklpd %ymm5, %ymm3, %ymm6
vunpckhpd %ymm5, %ymm3, %ymm3
vunpcklpd %ymm2, %ymm4, %ymm5
vunpckhpd %ymm2, %ymm4, %ymm4
vpermpd $216, %ymm0, %ymm0
vpermpd $216, %ymm4, %ymm2
vpermpd $216, %ymm1, %ymm1
vmovupd %ymm2, 832(%rsp)
vunpcklpd %ymm0, %ymm1, %ymm2
vunpckhpd %ymm0, %ymm1, %ymm1
vpermpd $216, %ymm1, %ymm0
vmovupd %ymm0, 896(%rsp)
vmovupd (%rbx), %ymm0
vpermpd $216, %ymm2, %ymm4
vunpckhpd 32(%rbx), %ymm0, %ymm1
vunpcklpd 32(%rbx), %ymm0, %ymm2
vmovupd 64(%rbx), %ymm0
vpermpd $216, %ymm5, %ymm5
vmovupd %ymm5, 800(%rsp)
vmovupd %ymm4, 864(%rsp)
vunpcklpd 96(%rbx), %ymm0, %ymm5
vunpckhpd 96(%rbx), %ymm0, %ymm4
vmovupd 128(%rbx), %ymm0
vpermpd $216, %ymm6, %ymm6
vpermpd $216, %ymm3, %ymm3
vmovupd %ymm9, 704(%rsp)
vmovupd %ymm6, 736(%rsp)
vmovupd %ymm3, 768(%rsp)
vunpcklpd 160(%rbx), %ymm0, %ymm3
vmovupd 192(%rbx), %ymm8
vunpckhpd 160(%rbx), %ymm0, %ymm0
vunpcklpd 224(%rbx), %ymm8, %ymm6
vunpckhpd 224(%rbx), %ymm8, %ymm9
vpermpd $216, %ymm5, %ymm5
vpermpd $216, %ymm4, %ymm4
vpermpd $216, %ymm6, %ymm6
vpermpd $216, %ymm9, %ymm9
vpermpd $216, %ymm2, %ymm2
vpermpd $216, %ymm1, %ymm1
vpermpd $216, %ymm3, %ymm3
vpermpd $216, %ymm0, %ymm0
vunpcklpd %ymm5, %ymm2, %ymm8
vunpckhpd %ymm5, %ymm2, %ymm2
vunpcklpd %ymm6, %ymm3, %ymm5
vunpckhpd %ymm6, %ymm3, %ymm3
vunpcklpd %ymm4, %ymm1, %ymm6
vunpckhpd %ymm4, %ymm1, %ymm1
vunpcklpd %ymm9, %ymm0, %ymm4
vunpckhpd %ymm9, %ymm0, %ymm0
vpermpd $216, %ymm5, %ymm5
vpermpd $216, %ymm3, %ymm3
vpermpd $216, %ymm4, %ymm4
vpermpd $216, %ymm0, %ymm0
vpermpd $216, %ymm8, %ymm8
vpermpd $216, %ymm2, %ymm2
vpermpd $216, %ymm6, %ymm6
vpermpd $216, %ymm1, %ymm1
vunpcklpd %ymm5, %ymm8, %ymm9
vunpckhpd %ymm5, %ymm8, %ymm8
vunpcklpd %ymm4, %ymm6, %ymm5
vunpckhpd %ymm4, %ymm6, %ymm6
vunpcklpd %ymm3, %ymm2, %ymm4
vunpckhpd %ymm3, %ymm2, %ymm2
vunpcklpd %ymm0, %ymm1, %ymm3
vunpckhpd %ymm0, %ymm1, %ymm1
vpermpd $216, %ymm9, %ymm9
vpermpd $216, %ymm8, %ymm8
vpermpd $216, %ymm1, %ymm0
vpermpd $216, %ymm10, %ymm15
vmovupd %ymm0, 240(%rsp)
vmulpd 320(%rsp), %ymm9, %ymm10
vmulpd 64(%rsp), %ymm8, %ymm0
vmovupd (%rsp), %ymm1
vpermpd $216, %ymm12, %ymm12
vpermpd $216, %ymm7, %ymm7
vfmadd231pd 176(%rsp), %ymm12, %ymm10
vfmadd231pd %ymm1, %ymm7, %ymm0
vpermpd $216, %ymm14, %ymm14
vpermpd $216, %ymm11, %ymm11
vpermpd $216, %ymm6, %ymm6
vpermpd $216, %ymm5, %ymm5
vaddpd %ymm10, %ymm0, %ymm0
vmulpd 64(%rsp), %ymm9, %ymm10
vpermpd $216, %ymm2, %ymm2
vsubpd %ymm0, %ymm13, %ymm0
vmulpd 320(%rsp), %ymm8, %ymm13
vpermpd $216, %ymm4, %ymm4
vfmadd231pd %ymm1, %ymm12, %ymm10
vmovupd %ymm0, 352(%rsp)
vmovupd 208(%rsp), %ymm0
vfmadd231pd 176(%rsp), %ymm7, %ymm13
vpermpd $216, %ymm3, %ymm3
addq $256, %r9
addq $256, %rax
addq $256, %rsi
vsubpd %ymm13, %ymm10, %ymm10
vmulpd %ymm0, %ymm9, %ymm13
vmulpd 96(%rsp), %ymm9, %ymm9
vaddpd %ymm15, %ymm10, %ymm10
vmulpd 96(%rsp), %ymm8, %ymm15
vmulpd %ymm0, %ymm8, %ymm8
vmovupd %ymm10, 928(%rsp)
vmovupd 128(%rsp), %ymm10
vfmadd231pd 32(%rsp), %ymm12, %ymm13
vfmadd231pd %ymm10, %ymm12, %ymm9
vmovupd 32(%rsp), %ymm12
vfmadd231pd %ymm10, %ymm7, %ymm15
vfmadd231pd %ymm12, %ymm7, %ymm8
vmovupd (%rsp), %ymm7
addq $256, %rbx
addq $256, %r11
vaddpd %ymm15, %ymm13, %ymm13
vsubpd %ymm8, %ymm9, %ymm9
vmovapd %ymm10, %ymm15
vmovupd 288(%rsp), %ymm10
vsubpd %ymm13, %ymm14, %ymm1
vaddpd 704(%rsp), %ymm9, %ymm13
vmulpd %ymm15, %ymm10, %ymm9
vmulpd %ymm10, %ymm7, %ymm7
vmovupd 176(%rsp), %ymm14
vmovupd 320(%rsp), %ymm15
vmovupd %ymm1, 960(%rsp)
vfmadd231pd %ymm12, %ymm11, %ymm9
vmovupd 64(%rsp), %ymm12
vfmadd231pd %ymm14, %ymm11, %ymm7
vmulpd %ymm12, %ymm6, %ymm8
vmovupd 512(%rsp), %ymm1
vfmadd231pd %ymm15, %ymm5, %ymm8
vaddpd %ymm8, %ymm7, %ymm7
vmulpd %ymm12, %ymm5, %ymm8
vmulpd %ymm15, %ymm6, %ymm12
vsubpd %ymm7, %ymm1, %ymm7
vfmadd231pd (%rsp), %ymm11, %ymm8
vfmadd231pd %ymm14, %ymm10, %ymm12
vsubpd %ymm12, %ymm8, %ymm8
vmulpd 96(%rsp), %ymm6, %ymm12
vmulpd %ymm0, %ymm6, %ymm6
vaddpd 544(%rsp), %ymm8, %ymm8
vmovupd 736(%rsp), %ymm1
vmovupd 288(%rsp), %ymm10
vfmadd231pd %ymm0, %ymm5, %ymm12
vmulpd 96(%rsp), %ymm5, %ymm5
vmovupd 416(%rsp), %ymm0
vaddpd %ymm12, %ymm9, %ymm9
vmovupd 32(%rsp), %ymm12
vsubpd %ymm9, %ymm1, %ymm9
vmovupd 128(%rsp), %ymm1
vfmadd231pd %ymm12, %ymm10, %ymm6
vfmadd231pd %ymm1, %ymm11, %ymm5
vmovupd 384(%rsp), %ymm10
vmovupd %ymm9, 512(%rsp)
vsubpd %ymm6, %ymm5, %ymm11
vmulpd %ymm1, %ymm0, %ymm5
vmovupd (%rsp), %ymm6
vmovupd 576(%rsp), %ymm1
vmulpd %ymm0, %ymm6, %ymm9
vaddpd 768(%rsp), %ymm11, %ymm11
vfmadd231pd %ymm12, %ymm10, %ymm5
vmovupd 64(%rsp), %ymm12
vmulpd %ymm12, %ymm2, %ymm6
vfmadd231pd %ymm10, %ymm14, %ymm9
vfmadd231pd %ymm15, %ymm4, %ymm6
vaddpd %ymm9, %ymm6, %ymm6
vmulpd %ymm12, %ymm4, %ymm9
vmulpd %ymm15, %ymm2, %ymm12
vsubpd %ymm6, %ymm1, %ymm6
vmovupd 800(%rsp), %ymm1
vfmadd231pd (%rsp), %ymm10, %ymm9
vfmadd231pd %ymm14, %ymm0, %ymm12
vsubpd %ymm12, %ymm9, %ymm9
vmulpd 96(%rsp), %ymm2, %ymm12
vmulpd 208(%rsp), %ymm2, %ymm2
vaddpd 608(%rsp), %ymm9, %ymm9
vfmadd231pd 208(%rsp), %ymm4, %ymm12
vmulpd 96(%rsp), %ymm4, %ymm4
vaddpd %ymm12, %ymm5, %ymm5
vfmadd231pd 128(%rsp), %ymm10, %ymm4
vmovupd 480(%rsp), %ymm10
vsubpd %ymm5, %ymm1, %ymm5
vmovapd %ymm0, %ymm1
vmovupd 32(%rsp), %ymm0
vfmadd231pd %ymm0, %ymm1, %ymm2
vmovupd 448(%rsp), %ymm1
vsubpd %ymm2, %ymm4, %ymm4
vmovupd (%rsp), %ymm2
vmulpd %ymm10, %ymm2, %ymm12
vmulpd 128(%rsp), %ymm10, %ymm2
vaddpd 832(%rsp), %ymm4, %ymm4
vfmadd231pd %ymm1, %ymm14, %ymm12
vfmadd231pd %ymm0, %ymm1, %ymm2
vmovupd 240(%rsp), %ymm0
vmulpd 64(%rsp), %ymm0, %ymm14
vfmadd231pd %ymm15, %ymm3, %ymm14
vmulpd %ymm0, %ymm15, %ymm15
vaddpd %ymm14, %ymm12, %ymm12
vmovupd 640(%rsp), %ymm14
vfmadd231pd 176(%rsp), %ymm10, %ymm15
vsubpd %ymm12, %ymm14, %ymm12
vmulpd 64(%rsp), %ymm3, %ymm14
vfmadd231pd (%rsp), %ymm1, %ymm14
vsubpd %ymm15, %ymm14, %ymm14
vaddpd 672(%rsp), %ymm14, %ymm14
vmulpd 96(%rsp), %ymm0, %ymm15
vmovupd 208(%rsp), %ymm0
vfmadd231pd %ymm0, %ymm3, %ymm15
vmulpd 96(%rsp), %ymm3, %ymm3
vaddpd %ymm15, %ymm2, %ymm2
vmovupd 864(%rsp), %ymm15
vfmadd231pd 128(%rsp), %ymm1, %ymm3
vsubpd %ymm2, %ymm15, %ymm2
vmovupd 240(%rsp), %ymm15
vmulpd %ymm0, %ymm15, %ymm1
vpermpd $68, 352(%rsp), %ymm15
vpermpd $238, 352(%rsp), %ymm0
vfmadd231pd 32(%rsp), %ymm10, %ymm1
vmovupd 928(%rsp), %ymm10
vsubpd %ymm1, %ymm3, %ymm1
vpermpd $68, %ymm10, %ymm3
vpermpd $238, %ymm10, %ymm10
vshufpd $12, %ymm3, %ymm15, %ymm3
vshufpd $12, %ymm10, %ymm0, %ymm10
vpermpd $68, %ymm7, %ymm15
vpermpd $68, %ymm8, %ymm0
vpermpd $238, %ymm7, %ymm7
vpermpd $238, %ymm8, %ymm8
vshufpd $12, %ymm0, %ymm15, %ymm15
vshufpd $12, %ymm8, %ymm7, %ymm7
vpermpd $68, %ymm9, %ymm0
vpermpd $68, %ymm6, %ymm8
vshufpd $12, %ymm0, %ymm8, %ymm8
vpermpd $238, %ymm6, %ymm6
vpermpd $238, %ymm9, %ymm0
vshufpd $12, %ymm0, %ymm6, %ymm0
vpermpd $68, %ymm12, %ymm9
vpermpd $68, %ymm14, %ymm6
vpermpd $238, %ymm12, %ymm12
vpermpd $238, %ymm14, %ymm14
vshufpd $12, %ymm6, %ymm9, %ymm6
vshufpd $12, %ymm14, %ymm12, %ymm12
vpermpd $68, %ymm8, %ymm9
vpermpd $68, %ymm3, %ymm14
vpermpd $238, %ymm8, %ymm8
vpermpd $238, %ymm3, %ymm3
vshufpd $12, %ymm9, %ymm14, %ymm9
vshufpd $12, %ymm8, %ymm3, %ymm8
vpermpd $68, %ymm10, %ymm14
vpermpd $68, %ymm0, %ymm3
vpermpd $238, %ymm10, %ymm10
vpermpd $238, %ymm0, %ymm0
vshufpd $12, %ymm3, %ymm14, %ymm3
vshufpd $12, %ymm0, %ymm10, %ymm0
vpermpd $68, %ymm15, %ymm14
vpermpd $68, %ymm6, %ymm10
vpermpd $238, %ymm15, %ymm15
vpermpd $238, %ymm6, %ymm6
vshufpd $12, %ymm10, %ymm14, %ymm10
vshufpd $12, %ymm6, %ymm15, %ymm15
vpermpd $68, %ymm7, %ymm14
vpermpd $68, %ymm12, %ymm6
vpermpd $238, %ymm7, %ymm7
vpermpd $238, %ymm12, %ymm12
vshufpd $12, %ymm6, %ymm14, %ymm6
vshufpd $12, %ymm12, %ymm7, %ymm7
vpermpd $68, %ymm10, %ymm14
vpermpd $68, %ymm9, %ymm12
vpermpd $238, %ymm10, %ymm10
vpermpd $238, %ymm9, %ymm9
vshufpd $12, %ymm10, %ymm9, %ymm9
vpermpd $68, %ymm15, %ymm10
vmovupd %ymm9, -224(%rax)
vpermpd $238, %ymm15, %ymm15
vpermpd $68, %ymm8, %ymm9
vpermpd $238, %ymm8, %ymm8
vshufpd $12, %ymm10, %ymm9, %ymm9
vshufpd $12, %ymm15, %ymm8, %ymm8
vmovupd %ymm9, -192(%rax)
vmovupd %ymm8, -160(%rax)
vpermpd $68, %ymm6, %ymm9
vpermpd $68, %ymm3, %ymm8
vpermpd $238, %ymm6, %ymm6
vpermpd $238, %ymm3, %ymm3
vshufpd $12, %ymm6, %ymm3, %ymm3
vpermpd $68, %ymm7, %ymm6
vmovupd %ymm3, -96(%rax)
vpermpd $238, %ymm7, %ymm7
vpermpd $68, %ymm0, %ymm3
vpermpd $238, %ymm0, %ymm0
vshufpd $12, %ymm7, %ymm0, %ymm0
vmovupd 960(%rsp), %ymm7
vshufpd $12, %ymm6, %ymm3, %ymm3
vshufpd $12, %ymm14, %ymm12, %ymm12
vmovupd %ymm3, -64(%rax)
vpermpd $238, %ymm7, %ymm14
vpermpd $68, %ymm7, %ymm3
vmovupd 512(%rsp), %ymm7
vmovupd %ymm0, -32(%rax)
vaddpd 896(%rsp), %ymm1, %ymm1
vpermpd $68, %ymm13, %ymm0
vshufpd $12, %ymm0, %ymm3, %ymm3
vpermpd $68, %ymm7, %ymm6
vpermpd $68, %ymm11, %ymm0
vshufpd $12, %ymm9, %ymm8, %ymm8
vshufpd $12, %ymm0, %ymm6, %ymm6
vmovupd %ymm8, -128(%rax)
vpermpd $238, %ymm7, %ymm9
vpermpd $68, %ymm4, %ymm0
vpermpd $68, %ymm5, %ymm8
vpermpd $238, %ymm11, %ymm11
vshufpd $12, %ymm11, %ymm9, %ymm11
vshufpd $12, %ymm0, %ymm8, %ymm8
vpermpd $68, %ymm2, %ymm9
vpermpd $68, %ymm1, %ymm0
vshufpd $12, %ymm0, %ymm9, %ymm9
vpermpd $238, %ymm5, %ymm5
vpermpd $68, %ymm8, %ymm0
vpermpd $68, %ymm3, %ymm7
vpermpd $238, %ymm13, %ymm13
vpermpd $238, %ymm4, %ymm4
vshufpd $12, %ymm4, %ymm5, %ymm4
vshufpd $12, %ymm0, %ymm7, %ymm7
vpermpd $238, %ymm2, %ymm2
vpermpd $68, %ymm4, %ymm0
vshufpd $12, %ymm13, %ymm14, %ymm13
vpermpd $238, %ymm4, %ymm4
vpermpd $68, %ymm13, %ymm10
vpermpd $238, %ymm1, %ymm1
vpermpd $238, %ymm13, %ymm13
vshufpd $12, %ymm1, %ymm2, %ymm1
vshufpd $12, %ymm0, %ymm10, %ymm10
vpermpd $68, %ymm9, %ymm2
vshufpd $12, %ymm4, %ymm13, %ymm0
vpermpd $68, %ymm6, %ymm4
vshufpd $12, %ymm2, %ymm4, %ymm4
vpermpd $238, %ymm8, %ymm8
vpermpd $68, %ymm1, %ymm2
vpermpd $68, %ymm11, %ymm5
vpermpd $238, %ymm3, %ymm3
vshufpd $12, %ymm8, %ymm3, %ymm3
vshufpd $12, %ymm2, %ymm5, %ymm5
vpermpd $68, %ymm4, %ymm8
vpermpd $68, %ymm7, %ymm2
vpermpd $238, %ymm4, %ymm4
vpermpd $238, %ymm6, %ymm6
vpermpd $238, %ymm9, %ymm9
vpermpd $238, %ymm7, %ymm7
vmovupd %ymm12, -256(%rax)
vshufpd $12, %ymm9, %ymm6, %ymm6
vshufpd $12, %ymm8, %ymm2, %ymm2
vshufpd $12, %ymm4, %ymm7, %ymm7
vmovupd %ymm2, -256(%r11)
vpermpd $68, %ymm6, %ymm4
vpermpd $68, %ymm3, %ymm2
vpermpd $238, %ymm6, %ymm6
vpermpd $238, %ymm3, %ymm3
vshufpd $12, %ymm4, %ymm2, %ymm2
vshufpd $12, %ymm6, %ymm3, %ymm3
vmovupd %ymm2, -192(%r11)
vmovupd %ymm3, -160(%r11)
vpermpd $68, %ymm10, %ymm2
vpermpd $68, %ymm5, %ymm3
vpermpd $238, %ymm11, %ymm11
vpermpd $238, %ymm1, %ymm1
vshufpd $12, %ymm3, %ymm2, %ymm2
vshufpd $12, %ymm1, %ymm11, %ymm1
vmovupd %ymm2, -128(%r11)
vpermpd $68, %ymm1, %ymm3
vpermpd $68, %ymm0, %ymm2
vpermpd $238, %ymm10, %ymm10
vpermpd $238, %ymm5, %ymm5
vpermpd $238, %ymm0, %ymm0
vpermpd $238, %ymm1, %ymm1
vshufpd $12, %ymm5, %ymm10, %ymm5
vshufpd $12, %ymm3, %ymm2, %ymm2
vshufpd $12, %ymm1, %ymm0, %ymm1
vmovupd %ymm7, -224(%r11)
vmovupd %ymm5, -96(%r11)
vmovupd %ymm2, -64(%r11)
vmovupd %ymm1, -32(%r11)
cmpq %r9, %rdi
jne .L5
That's almost 7 times slower than -O3, 2.4 times slower than the scalar code
generated by -O2, and twice slower than clang -Ofast.
Being twice slower than clang is no small feat.
I knew about this bug several weeks ago, but somehow didn't realize that 11.0
was so near, so I was too lazy to report it at the time.
Now I am sorry.
Sources and compilation scripts for bigger, more real-world testbench here:
https://github.com/already5chosen/others/tree/master/cholesky_solver/gcc-badopt-aosoa-caxpy2x2
* [Bug target/97832] AoSoA complex caxpy-like loops: AVX2+FMA -Ofast 7 times slower than -O3
From: rguenth at gcc dot gnu.org @ 2020-11-16 7:21 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97832
Richard Biener <rguenth at gcc dot gnu.org> changed:
 What             |Removed                       |Added
----------------------------------------------------------------------------
 Assignee         |unassigned at gcc dot gnu.org |rguenth at gcc dot gnu.org
 CC               |                              |rguenth at gcc dot gnu.org
 Target           |x86-64                        |x86_64-*-* i?86-*-*
 Keywords         |                              |missed-optimization
 Blocks           |                              |53947
 Last reconfirmed |                              |2020-11-16
 Ever confirmed   |0                             |1
 Status           |UNCONFIRMED                   |ASSIGNED
--- Comment #1 from Richard Biener <rguenth at gcc dot gnu.org> ---
Let me do some initial analysis.
Referenced Bugs:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53947
[Bug 53947] [meta-bug] vectorizer missed-optimizations
* [Bug target/97832] AoSoA complex caxpy-like loops: AVX2+FMA -Ofast 7 times slower than -O3
From: rguenth at gcc dot gnu.org @ 2020-11-16 11:11 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97832
--- Comment #2 from Richard Biener <rguenth at gcc dot gnu.org> ---
It's again reassociation making a mess out of the natural SLP opportunity (and
thus SLP discovery fails miserably).
One idea worth playing with would be to change reassociation to rank references
from the same load group (as later vectorization would discover) the same.
That said, further analysis and maybe a smaller testcase to look at would be
useful here. There is, after all, the opportunity to turn "bad" association at
the source level into good association for vectorization when -ffast-math is
enabled as well.
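As an illustration of that last point (my own sketch, not from the thread):
with -ffast-math the source-level chain

  y_re = y_re - (x_re * f_re + x_im * f_im);   /* a - (b + c) */

may legitimately be rewritten into the lane-balanced

  y_re = (y_re - x_re * f_re) - x_im * f_im;   /* (a - b) - c */

which has the same shape as the neighboring y_im chain, so SLP could pair the
two stores.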
* [Bug target/97832] AoSoA complex caxpy-like loops: AVX2+FMA -Ofast 7 times slower than -O3
From: already5chosen at yahoo dot com @ 2020-11-16 20:11 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97832
--- Comment #3 from Michael_S <already5chosen at yahoo dot com> ---
(In reply to Richard Biener from comment #2)
> It's again reassociation making a mess out of the natural SLP opportunity
> (and thus SLP discovery fails miserably).
>
> One idea worth playing with would be to change reassociation to rank
> references
> from the same load group (as later vectorization would discover) the same.
>
> That said, further analysis and maybe a smaller testcase to look at would be
> useful here. There is, after all, the opportunity to turn "bad" association
> at the source level into good association for vectorization when -ffast-math
> is enabled as well.
It turned out that a much simpler kernel suffers from the same problem.

void foo1x1(double* restrict y, const double* restrict x, int clen)
{
  int xi = clen & 2;
  double f_re = x[0+xi+0];
  double f_im = x[4+xi+0];
  int clen2 = (clen+xi) * 2;
  #pragma GCC unroll 0
  for (int c = 0; c < clen2; c += 8) {
    // y[c] = y[c] - x[c]*conj(f);
    #pragma GCC unroll 4
    for (int k = 0; k < 4; ++k) {
      double x_re = x[c+0+k];
      double x_im = x[c+4+k];
      double y_re = y[c+0+k];
      double y_im = y[c+4+k];
      y_re = y_re - x_re * f_re - x_im * f_im;
      y_im = y_im + x_re * f_im - x_im * f_re;
      y[c+0+k] = y_re;
      y[c+4+k] = y_im;
    }
  }
}

Maybe it's possible to simplify further, but probably not by much.
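For reference, a minimal AVX2+FMA intrinsics sketch (my own illustration, not
part of the original report) of the vectorization one would expect for this
kernel - one 4-wide vector of real parts and one of imaginary parts per group:

#include <immintrin.h>

void foo1x1_sketch(double* restrict y, const double* restrict x, int clen)
{
  int xi = clen & 2;
  __m256d f_re = _mm256_set1_pd(x[0+xi]);   /* broadcast f_re */
  __m256d f_im = _mm256_set1_pd(x[4+xi]);   /* broadcast f_im */
  int clen2 = (clen+xi) * 2;
  for (int c = 0; c < clen2; c += 8) {
    __m256d x_re = _mm256_loadu_pd(&x[c+0]);
    __m256d x_im = _mm256_loadu_pd(&x[c+4]);
    __m256d y_re = _mm256_loadu_pd(&y[c+0]);
    __m256d y_im = _mm256_loadu_pd(&y[c+4]);
    /* y_re = y_re - x_re*f_re - x_im*f_im : two FNMAs */
    y_re = _mm256_fnmadd_pd(x_im, f_im, _mm256_fnmadd_pd(x_re, f_re, y_re));
    /* y_im = y_im + x_re*f_im - x_im*f_re : one FMA, one FNMA */
    y_im = _mm256_fnmadd_pd(x_im, f_re, _mm256_fmadd_pd(x_re, f_im, y_im));
    _mm256_storeu_pd(&y[c+0], y_re);
    _mm256_storeu_pd(&y[c+4], y_im);
  }
}

Everything stays in-lane, so any vpermpd/vblendpd in compiler output for this
kernel is pure overhead.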
* [Bug tree-optimization/97832] AoSoA complex caxpy-like loops: AVX2+FMA -Ofast 7 times slower than -O3
From: rguenth at gcc dot gnu.org @ 2020-11-17 9:21 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97832
Richard Biener <rguenth at gcc dot gnu.org> changed:
 What             |Removed                       |Added
----------------------------------------------------------------------------
 Component        |target                        |tree-optimization
--- Comment #4 from Richard Biener <rguenth at gcc dot gnu.org> ---
Ah, thanks - that helps. So we're re-associating from
*_89 = (((*_89) - (f_re_34 * x_re_82)) - (f_im_35 * x_im_88));
*_91 = (((*_91) + (f_im_35 * x_re_82)) - (f_re_34 * x_im_88));
to
*_89 = ((*_89) - ((f_re_34 * x_re_82) + (f_im_35 * x_im_88)));
*_91 = (((*_91) + (f_im_35 * x_re_82)) - (f_re_34 * x_im_88));
which makes the operations unbalanced. This is (a - b) - c -> a - (b + c),
as we're optimizing this as a + -b + -c.
Even smaller testcase:
double a[1024], b[1024], c[1024];
void foo()
{
  for (int i = 0; i < 256; ++i)
    {
      a[2*i] = a[2*i] + b[2*i] - c[2*i];
      a[2*i+1] = a[2*i+1] - b[2*i+1] - c[2*i+1];
    }
}
Here the ranks end up associating the expr as (-b + -c) + a, and negate
re-propagation goes (-b - c) + a -> -(b + c) + a -> a - (b + c),
which is all sensible in isolation.
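Written back out as C (my paraphrase of the above, not a compiler dump), the
two lanes end up with different shapes:

  a[2*i]   = (a[2*i] + b[2*i]) - c[2*i];        /* balanced 3-operand chain */
  a[2*i+1] = a[2*i+1] - (b[2*i+1] + c[2*i+1]);  /* a - (b + c) after negate
                                                   re-propagation */

so SLP discovery can no longer pair the two stores lane for lane.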
You could say that associating as (-b + -c) + a is worse than
(a + -b) + -c in this respect. Ranks are
Rank for _8 is 327683 (a)
Rank for _13 is 327684 (-b)
Rank for _21 is 327684 (-c)
where the rank is one more for the negated values because of the
negate operation. While heuristically ignoring negates for rank
propagation to make all ranks equal helps this new testcase it
doesn't help for the larger two.
It might still be a generally sound heuristic improvement though.
For the effects on vectorization I think we need to do something in the
vectorizer itself, for example linearizing expressions. The
first reassoc pass is supposed to do this, but then negate
re-propagation undoes it in this case - which maybe points at
what needs fixing: somehow associating a not-negated operand
first.
* [Bug tree-optimization/97832] AoSoA complex caxpy-like loops: AVX2+FMA -Ofast 7 times slower than -O3
From: rguenth at gcc dot gnu.org @ 2020-11-17 10:18 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97832
--- Comment #5 from Richard Biener <rguenth at gcc dot gnu.org> ---
OK, so I have a patch to keep the association linear, which IMHO is good. It
fixes the smaller testcase and mine, but not the original one, which is now
linear but still not homogeneous. The store groups are as follows:
*_115 = (((((*_115) - (f00_re_68 * x0_re_108)) - (f10_re_70 * x1_re_140)) -
(f00_im_73 * x0_im_114)) - (f10_im_74 * x1_im_142));
*_117 = (((((*_117) + (f00_im_73 * x0_re_108)) + (f10_im_74 * x1_re_140)) -
(f00_re_68 * x0_im_114)) - (f10_re_70 * x1_im_142));
*_119 = (((((*_119) - (f01_re_71 * x0_re_108)) - (f11_re_72 * x1_re_140)) -
(f01_im_75 * x0_im_114)) - (f11_im_76 * x1_im_142));
*_121 = (((((*_121) + (f01_im_75 * x0_re_108)) + (f11_im_76 * x1_re_140)) -
(f01_re_71 * x0_im_114)) - (f11_re_72 * x1_im_142));
(good)
*_177 = (((((*_177) - (f00_re_68 * x0_re_170)) - (f00_im_73 * x0_im_176)) -
(f10_re_70 * x1_re_202)) - (f10_im_74 * x1_im_204));
*_179 = (((((f00_im_73 * x0_re_170) + (f10_im_74 * x1_re_202)) + (*_179)) -
(f00_re_68 * x0_im_176)) - (f10_re_70 * x1_im_204));
*_181 = (((((*_181) - (f01_re_71 * x0_re_170)) - (f01_im_75 * x0_im_176)) -
(f11_re_72 * x1_re_202)) - (f11_im_76 * x1_im_204));
*_183 = (((((f01_im_75 * x0_re_170) + (f11_im_76 * x1_re_202)) + (*_183)) -
(f01_re_71 * x0_im_176)) - (f11_re_72 * x1_im_204));
already bad. Now, this is something to tackle in the vectorizer, which ideally
should not try to match up individual adds during SLP discovery but
instead (if association is allowed) the whole addition chain, commutating
within the whole chain rather than just swapping individual add operands.
I still think the reassoc change I came up with is good since it avoids
the need to linearize in the vectorizer. So testing that now.
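In source terms (my paraphrase of the dumps above, not compiler output), the
discovery would have to recognize

  *_179 = f00_im*x0_re + f10_im*x1_re + *_179 - f00_re*x0_im - f10_re*x1_im;

as equivalent to the lane-friendly

  *_179 = *_179 + f00_im*x0_re + f10_im*x1_re - f00_re*x0_im - f10_re*x1_im;

i.e. commute the *_179 term to the head of the whole chain instead of merely
swapping the two operands of a single add.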
* [Bug tree-optimization/97832] AoSoA complex caxpy-like loops: AVX2+FMA -Ofast 7 times slower than -O3
From: rguenth at gcc dot gnu.org @ 2020-11-18 8:53 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97832
--- Comment #6 from Richard Biener <rguenth at gcc dot gnu.org> ---
So for example we'd like to vectorize with SLP when reassociation is permitted
(thus with -Ofast):
double a[1024], b[1024], c[1024];
void foo()
{
  for (int i = 0; i < 256; ++i)
    {
      a[2*i] = 1. - a[2*i] + b[2*i];
      a[2*i+1] = a[2*i+1] + b[2*i+1] + 1.;
    }
}
it again works when written as follows and compiled with -fno-tree-reassoc:
double a[1024], b[1024], c[1024];
void foo()
{
  for (int i = 0; i < 256; ++i)
    {
      a[2*i] = 1. - a[2*i] + b[2*i];
      a[2*i+1] = 1 + a[2*i+1] + b[2*i+1];
    }
}
* [Bug tree-optimization/97832] AoSoA complex caxpy-like loops: AVX2+FMA -Ofast 7 times slower than -O3
From: rguenth at gcc dot gnu.org @ 2020-11-18 9:15 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97832
--- Comment #7 from Richard Biener <rguenth at gcc dot gnu.org> ---
Or
double a[1024], b[1024], c[1024];
void foo()
{
  for (int i = 0; i < 256; ++i)
    {
      a[2*i] = 1. - a[2*i] + b[2*i];
      a[2*i+1] = 1 + a[2*i+1] - b[2*i+1];
    }
}
which early folding breaks unless we add -fno-associative-math. We then
end up with
a[_1] = (((b[_1]) - (a[_1])) + 1.0e+0);
a[_6] = (((a[_6]) - (b[_6])) + 1.0e+0);
where SLP operator swapping cannot bring the grouped loads into the same lanes.
So the idea is to look at single-use chains of plus/minus operations and
handle those as wide associated SLP nodes with flags denoting which lanes
need negation. We'd have three children and each child has a per-lane
spec whether to add or subtract.
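For the testcase above, such a node would look roughly like this (a sketch of
the idea only, not actual GCC data structures):

  lanes:    { a[2*i], a[2*i+1] }
  children: { 1.0, a[], b[] }
  signs:    child 1.0: { +, + }
            child a[]: { -, + }   /* 1. - a[2*i]  vs  1 + a[2*i+1] */
            child b[]: { +, - }   /* + b[2*i]     vs  - b[2*i+1]   */

The vectorized code would then negate the selected lanes of each child before
summing, e.g. via an XOR with a sign mask or an addsub-style operation.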
* [Bug tree-optimization/97832] AoSoA complex caxpy-like loops: AVX2+FMA -Ofast 7 times slower than -O3
From: rguenth at gcc dot gnu.org @ 2020-11-18 13:23 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97832
--- Comment #8 from Richard Biener <rguenth at gcc dot gnu.org> ---
Created attachment 49586
--> https://gcc.gnu.org/bugzilla/attachment.cgi?id=49586&action=edit
prototype
This is a prototype patch which can serve as proof-of-concept. It needs
cleanup plus better handling of hybrid SLP discovery.
It depends on
https://gcc.gnu.org/pipermail/gcc-patches/2020-November/559347.html to fix the
testcase in this PR (which is included in the patch).
* [Bug tree-optimization/97832] AoSoA complex caxpy-like loops: AVX2+FMA -Ofast 7 times slower than -O3
From: rguenth at gcc dot gnu.org @ 2020-11-18 13:39 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97832
--- Comment #9 from Richard Biener <rguenth at gcc dot gnu.org> ---
There's then also a permute optimization left on the plate:
t.c:16:3: note: node 0x3a19590 (max_nunits=4, refcnt=2)
t.c:16:3: note: stmt 0 _153 = f11_im_76 * x1_im_142;
t.c:16:3: note: stmt 1 _213 = f11_re_72 * x1_re_202;
t.c:16:3: note: stmt 2 _275 = f11_re_72 * x1_re_264;
t.c:16:3: note: stmt 3 _337 = f11_re_72 * x1_re_326;
t.c:16:3: note: stmt 4 _155 = f11_im_76 * x1_re_140;
t.c:16:3: note: stmt 5 _217 = f11_im_76 * x1_re_202;
t.c:16:3: note: stmt 6 _279 = f11_im_76 * x1_re_264;
t.c:16:3: note: stmt 7 _341 = f11_im_76 * x1_re_326;
t.c:16:3: note: children 0x3a19600 0x3a19670
t.c:16:3: note: node (external) 0x3a19600 (max_nunits=1, refcnt=1)
t.c:16:3: note: { f11_im_76, f11_re_72, f11_re_72, f11_re_72,
f11_im_76, f11_im_76, f11_im_76, f11_im_76 }
t.c:16:3: note: node 0x3a19670 (max_nunits=4, refcnt=1)
t.c:16:3: note: stmt 0 x1_im_142 = *_141;
t.c:16:3: note: stmt 1 x1_re_202 = *_201;
t.c:16:3: note: stmt 2 x1_re_264 = *_263;
t.c:16:3: note: stmt 3 x1_re_326 = *_325;
t.c:16:3: note: stmt 4 x1_re_140 = *_139;
t.c:16:3: note: stmt 5 x1_re_202 = *_201;
t.c:16:3: note: stmt 6 x1_re_264 = *_263;
t.c:16:3: note: stmt 7 x1_re_326 = *_325;
t.c:16:3: note: load permutation { 4 1 2 3 0 1 2 3 }
which we currently do not handle (there's a FIXME as to permute externals,
currently we only handle splats as transparent for permutes).
* [Bug tree-optimization/97832] AoSoA complex caxpy-like loops: AVX2+FMA -Ofast 7 times slower than -O3
From: already5chosen at yahoo dot com @ 2020-11-19 19:55 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97832
--- Comment #10 from Michael_S <already5chosen at yahoo dot com> ---
I lost track of what you're talking about a long time ago.
But that's o.k.
* [Bug tree-optimization/97832] AoSoA complex caxpy-like loops: AVX2+FMA -Ofast 7 times slower than -O3
From: rguenth at gcc dot gnu.org @ 2020-11-20 7:10 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97832
--- Comment #11 from Richard Biener <rguenth at gcc dot gnu.org> ---
(In reply to Michael_S from comment #10)
> I lost track of what you're talking about a long time ago.
> But that's o.k.
No problem - difficult PRs tend to be used as media to brain-dump and record
work progress.
* [Bug tree-optimization/97832] AoSoA complex caxpy-like loops: AVX2+FMA -Ofast 7 times slower than -O3
From: cvs-commit at gcc dot gnu.org @ 2021-06-09 12:41 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97832
--- Comment #12 from CVS Commits <cvs-commit at gcc dot gnu.org> ---
The master branch has been updated by Richard Biener <rguenth@gcc.gnu.org>:
https://gcc.gnu.org/g:ce670e4faafb296d1f1a7828d20f8c8ba4686797
commit r12-1329-gce670e4faafb296d1f1a7828d20f8c8ba4686797
Author: Richard Biener <rguenther@suse.de>
Date: Wed Nov 18 14:17:34 2020 +0100
tree-optimization/97832 - handle associatable chains in SLP discovery
This makes SLP discovery handle associatable (including mixed
plus/minus) chains better by swapping operands across the whole
chain. To work this adds caching of the 'matches' lanes for
failed SLP discovery attempts, thereby fixing a failed SLP
discovery for the slp-pr98855.cc testcase which results in
building an operand from scalars as expected. Unfortunately
this makes us trip over the cost threshold so I'm XFAILing the
testcase for now.
For BB vectorization all this doesn't work because we have no way
to distinguish good from bad associations as we eventually build
operands from scalars and thus not fail in the classical sense.
2021-05-31 Richard Biener <rguenther@suse.de>
PR tree-optimization/97832
* tree-vectorizer.h (_slp_tree::failed): New.
* tree-vect-slp.c (_slp_tree::_slp_tree): Initialize
failed member.
(_slp_tree::~_slp_tree): Free failed.
(vect_build_slp_tree): Retain failed nodes and record
matches in them, copying that back out when running
into a cached fail. Dump start and end of discovery.
(dt_sort_cmp): New.
(vect_build_slp_tree_2): Handle associatable chains
together doing more aggressive operand swapping.
* gcc.dg/vect/pr97832-1.c: New testcase.
* gcc.dg/vect/pr97832-2.c: Likewise.
* gcc.dg/vect/pr97832-3.c: Likewise.
* g++.dg/vect/slp-pr98855.cc: XFAIL.
* [Bug tree-optimization/97832] AoSoA complex caxpy-like loops: AVX2+FMA -Ofast 7 times slower than -O3
From: rguenth at gcc dot gnu.org @ 2021-06-09 12:54 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97832
Richard Biener <rguenth at gcc dot gnu.org> changed:
 What             |Removed                       |Added
----------------------------------------------------------------------------
 Resolution       |---                           |FIXED
 Status           |ASSIGNED                      |RESOLVED
 Known to work    |                              |12.0
--- Comment #13 from Richard Biener <rguenth at gcc dot gnu.org> ---
Fixed (hopefully), for GCC 12.
* [Bug tree-optimization/97832] AoSoA complex caxpy-like loops: AVX2+FMA -Ofast 7 times slower than -O3
From: pinskia at gcc dot gnu.org @ 2022-01-21 0:16 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97832
Andrew Pinski <pinskia at gcc dot gnu.org> changed:
 What             |Removed                       |Added
----------------------------------------------------------------------------
 Target Milestone |---                           |12.0
* [Bug tree-optimization/97832] AoSoA complex caxpy-like loops: AVX2+FMA -Ofast 7 times slower than -O3
From: already5chosen at yahoo dot com @ 2022-11-24 23:22 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97832
--- Comment #14 from Michael_S <already5chosen at yahoo dot com> ---
I tested the smaller test bench from comment 3 with gcc trunk on godbolt.
The issue appears to be only partially fixed.
The -Ofast result is no longer the horror it was before, but it is still not as
good as -O3 or -O2. -Ofast code generation is still strange, and there are a
few vblendpd instructions that serve no useful purpose.
And -O2/-O3 is still not as good as it should be, or as good as icc.
But, as mentioned in my original post, over-aggressive load+op combining is a
separate problem.
* [Bug tree-optimization/97832] AoSoA complex caxpy-like loops: AVX2+FMA -Ofast 7 times slower than -O3
From: rguenth at gcc dot gnu.org @ 2022-11-25 8:16 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97832
--- Comment #15 from Richard Biener <rguenth at gcc dot gnu.org> ---
I can confirm we get
.L3:
vmovupd (%rsi), %ymm1
vmovupd 32(%rsi), %ymm0
addl $1, %eax
addq $64, %rdi
addq $64, %rsi
vblendpd $14, %ymm1, %ymm0, %ymm3
vblendpd $14, %ymm0, %ymm1, %ymm2
vfnmadd213pd -64(%rdi), %ymm5, %ymm3
vfmadd213pd -32(%rdi), %ymm7, %ymm1
vfnmadd132pd %ymm4, %ymm3, %ymm2
vfnmadd132pd %ymm6, %ymm1, %ymm0
vmovupd %ymm2, -64(%rdi)
vmovupd %ymm0, -32(%rdi)
cmpl %edx, %eax
jb .L3
instead of
.L3:
vmovupd (%rdx), %ymm1
vmovupd (%rdx), %ymm0
addl $1, %ecx
addq $64, %rax
vfmadd213pd -32(%rax), %ymm3, %ymm1
vfnmadd213pd -64(%rax), %ymm2, %ymm0
addq $64, %rdx
vfnmadd231pd -32(%rdx), %ymm3, %ymm0
vfnmadd231pd -32(%rdx), %ymm2, %ymm1
vmovupd %ymm0, -64(%rax)
vmovupd %ymm1, -32(%rax)
cmpl %esi, %ecx
jb .L3
the good case sees
<bb 4> [local count: 214748368]:
# ivtmp.27_211 = PHI <ivtmp.27_210(4), 0(3)>
# ivtmp.32_209 = PHI <ivtmp.32_208(4), ivtmp.32_212(3)>
# ivtmp.34_28 = PHI <ivtmp.34_51(4), ivtmp.34_52(3)>
_53 = (void *) ivtmp.34_28;
vect_x_re_54.13_193 = MEM <const vector(4) double> [(const double *)_53];
vect_x_im_60.21_176 = MEM <const vector(4) double> [(const double *)_53 +
32B];
_54 = (void *) ivtmp.32_209;
vect_y_re_62.9_200 = MEM <vector(4) double> [(double *)_54];
vect_y_re_62.10_198 = MEM <vector(4) double> [(double *)_54 + 32B];
vect__154.17_185 = .FMA (vect_x_re_54.13_193, _197, vect_y_re_62.10_198);
vect__66.16_188 = .FNMA (vect_x_re_54.13_193, _196, vect_y_re_62.9_200);
vect_y_re_68.23_173 = .FNMA (vect_x_im_60.21_176, _197, vect__66.16_188);
vect_y_re_68.23_172 = .FNMA (vect_x_im_60.21_176, _196, vect__154.17_185);
MEM <vector(4) double> [(double *)_54] = vect_y_re_68.23_173;
MEM <vector(4) double> [(double *)_54 + 32B] = vect_y_re_68.23_172;
ivtmp.27_210 = ivtmp.27_211 + 1;
ivtmp.32_208 = ivtmp.32_209 + 64;
ivtmp.34_51 = ivtmp.34_28 + 64;
if (bnd.6_207 > ivtmp.27_210)
goto <bb 4>; [90.00%]
while the bad has
<bb 4> [local count: 214748368]:
# ivtmp.31_65 = PHI <ivtmp.31_64(4), 0(3)>
# ivtmp.36_63 = PHI <ivtmp.36_62(4), ivtmp.36_204(3)>
# ivtmp.38_203 = PHI <ivtmp.38_59(4), ivtmp.38_60(3)>
_61 = (void *) ivtmp.38_203;
vect_x_im_60.13_211 = MEM <const vector(4) double> [(const double *)_61];
vect_x_im_60.14_209 = MEM <const vector(4) double> [(const double *)_61 +
32B];
vect_x_re_54.15_208 = VEC_PERM_EXPR <vect_x_im_60.14_209,
vect_x_im_60.13_211, { 0, 5, 6, 7 }>;
vect_x_re_54.23_192 = VEC_PERM_EXPR <vect_x_im_60.13_211,
vect_x_im_60.14_209, { 0, 5, 6, 7 }>;
_58 = (void *) ivtmp.36_63;
vect_y_re_62.9_218 = MEM <vector(4) double> [(double *)_58];
vect_y_re_62.10_216 = MEM <vector(4) double> [(double *)_58 + 32B];
vect__41.18_202 = .FMA (vect_x_im_60.13_211, _215, vect_y_re_62.10_216);
vect_y_re_68.17_205 = .FNMA (vect_x_re_54.15_208, _214, vect_y_re_62.9_218);
vect_y_re_68.25_189 = .FNMA (vect_x_re_54.23_192, _198, vect_y_re_68.17_205);
vect_y_re_68.25_188 = .FNMA (_199, vect_x_im_60.14_209, vect__41.18_202);
MEM <vector(4) double> [(double *)_58] = vect_y_re_68.25_189;
MEM <vector(4) double> [(double *)_58 + 32B] = vect_y_re_68.25_188;
ivtmp.31_64 = ivtmp.31_65 + 1;
ivtmp.36_62 = ivtmp.36_63 + 64;
ivtmp.38_59 = ivtmp.38_203 + 64;
if (ivtmp.31_64 < bnd.6_225)
goto <bb 4>; [90.00%]
the blends do not look like no-ops so I wonder if this is really computing
the same thing ... (it swaps lane 0 from the two loads from x but not the
stores)
* [Bug tree-optimization/97832] AoSoA complex caxpy-like loops: AVX2+FMA -Ofast 7 times slower than -O3
From: already5chosen at yahoo dot com @ 2022-11-25 13:19 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97832
--- Comment #16 from Michael_S <already5chosen at yahoo dot com> ---
On an unrelated note, why does the loop overhead use so many instructions?
Assuming that I am as misguided as gcc about load-op combining, I would write
it as:
sub %rax, %rdx
.L3:
vmovupd (%rdx,%rax), %ymm1
vmovupd 32(%rdx,%rax), %ymm0
vfmadd213pd 32(%rax), %ymm3, %ymm1
vfnmadd213pd (%rax), %ymm2, %ymm0
vfnmadd231pd 32(%rdx,%rax), %ymm3, %ymm0
vfnmadd231pd (%rdx,%rax), %ymm2, %ymm1
vmovupd %ymm0, (%rax)
vmovupd %ymm1, 32(%rax)
addq $64, %rax
decl %esi
jb .L3
The loop overhead in my variant is 3 x86 instructions == 2 macro-ops,
vs 5 x86 instructions == 4 macro-ops in the gcc variant.
Also, in the gcc variant all memory accesses have a displacement, which makes
them 1 byte longer. In my variant only half of the accesses have a displacement.
I think in the past I have seen cases where gcc generates optimal or
near-optimal code sequences for loop overhead. I wonder why it cannot do it
here.
* [Bug tree-optimization/97832] AoSoA complex caxpy-like loops: AVX2+FMA -Ofast 7 times slower than -O3
From: rguenth at gcc dot gnu.org @ 2022-11-25 20:46 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97832
--- Comment #17 from Richard Biener <rguenth at gcc dot gnu.org> ---
(In reply to Michael_S from comment #16)
> On an unrelated note, why does the loop overhead use so many instructions?
> Assuming that I am as misguided as gcc about load-op combining, I would
> write it as:
> sub %rax, %rdx
> .L3:
> vmovupd (%rdx,%rax), %ymm1
> vmovupd 32(%rdx,%rax), %ymm0
> vfmadd213pd 32(%rax), %ymm3, %ymm1
> vfnmadd213pd (%rax), %ymm2, %ymm0
> vfnmadd231pd 32(%rdx,%rax), %ymm3, %ymm0
> vfnmadd231pd (%rdx,%rax), %ymm2, %ymm1
> vmovupd %ymm0, (%rax)
> vmovupd %ymm1, 32(%rax)
> addq $64, %rax
> decl %esi
> jb .L3
>
> The loop overhead in my variant is 3 x86 instructions == 2 macro-ops,
> vs 5 x86 instructions == 4 macro-ops in the gcc variant.
> Also, in the gcc variant all memory accesses have a displacement, which makes
> them 1 byte longer. In my variant only half of the accesses have a
> displacement.
>
> I think in the past I have seen cases where gcc generates optimal or
> near-optimal code sequences for loop overhead. I wonder why it cannot do it
> here.
I don't think we currently consider IVs based on the difference of two
addresses. The cost benefit of no displacement is only size, otherwise
I have no idea why we have biased the %rax accesses by -32. Why we
fail to consider decrement-to-zero for the counter IV is probably because
IVCANON would add such an IV but the vectorizer replaces that and IVOPTs
doesn't consider re-adding it.
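At the source level, an IV based on the difference of two addresses
corresponds roughly to this sketch (my own illustration; note that subtracting
pointers into unrelated arrays is formally undefined in ISO C, even though it
matches what the hand-written assembly in comment 16 does):

#include <stddef.h>

void axpy_like(double* restrict y, const double* restrict x, ptrdiff_t n)
{
  ptrdiff_t d = (const char*)x - (const char*)y;  /* loop-invariant distance */
  for (double* p = y; p != y + n; ++p) {
    const double* q = (const double*)((const char*)p + d);
    *p -= *q;  /* x element reached via base(p) plus index(d) */
  }
}

Only one pointer is incremented per iteration, and the second array is
addressed with a base+index mode, as in the 'sub %rax, %rdx' variant from
comment 16.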
* [Bug tree-optimization/97832] AoSoA complex caxpy-like loops: AVX2+FMA -Ofast 7 times slower than -O3
From: amonakov at gcc dot gnu.org @ 2022-11-25 21:27 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97832
Alexander Monakov <amonakov at gcc dot gnu.org> changed:
 What             |Removed                       |Added
----------------------------------------------------------------------------
 CC               |                              |amonakov at gcc dot gnu.org
--- Comment #18 from Alexander Monakov <amonakov at gcc dot gnu.org> ---
The apparent 'bias' is introduced by instruction scheduling: haifa-sched lifts
a +64 increment over memory accesses, transforming +0 and +32 displacements to
-64 and -32. Sometimes this helps a little bit even on modern x86 CPUs.
Also note that 'vfnmadd231pd 32(%rdx,%rax), %ymm3, %ymm0' would be
'unlaminated' (turned to 2 uops before renaming), so selecting independent IVs
for the two arrays actually helps on this testcase.
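Schematically (my illustration of the unlamination point, not from the thread):

  vfnmadd231pd 32(%rdx,%rax), %ymm3, %ymm0  # indexed: unlaminated, 2 uops at rename
  vfnmadd231pd 32(%rdx), %ymm3, %ymm0       # base+disp: stays 1 fused uop at rename

so giving each array its own incremented pointer keeps the load-op
instructions micro-fused through renaming.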
* [Bug tree-optimization/97832] AoSoA complex caxpy-like loops: AVX2+FMA -Ofast 7 times slower than -O3
From: already5chosen at yahoo dot com @ 2022-11-26 18:27 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97832
--- Comment #19 from Michael_S <already5chosen at yahoo dot com> ---
(In reply to Alexander Monakov from comment #18)
> The apparent 'bias' is introduced by instruction scheduling: haifa-sched
> lifts a +64 increment over memory accesses, transforming +0 and +32
> displacements to -64 and -32. Sometimes this helps a little bit even on
> modern x86 CPUs.
I don't think that it ever helps on Intel Sandy Bridge or later or on AMD Zen1
or later.
>
> Also note that 'vfnmadd231pd 32(%rdx,%rax), %ymm3, %ymm0' would be
> 'unlaminated' (turned to 2 uops before renaming), so selecting independent
> IVs for the two arrays actually helps on this testcase.
Both 'vfnmadd231pd 32(%rdx,%rax), %ymm3, %ymm0' and 'vfnmadd231pd 32(%rdx),
%ymm3, %ymm0' would be turned into 2 uops.
Misuse of load+op is a far bigger problem in this particular test case than
the sub-optimal loop overhead. Assuming execution on Intel Skylake, it turns a
loop that can potentially run at 3 clocks per iteration into a loop of 4+
clocks per iteration.
But I consider it a separate issue. I reported a similar issue in 97127, but
here it is more serious. It looks to me that the issue is not soluble within
the existing gcc optimization framework. The only chance is if you accept my
old and simple advice - within inner loops, pretend that AVX is RISC, i.e.
generate code as if the load-op form of AVX instructions didn't exist.
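Spelled out (my illustration, AT&T syntax), the "AVX as RISC" suggestion means
always emitting the load separately inside inner loops:

  vmovupd      32(%rdx,%rax), %ymm5          # explicit load
  vfnmadd231pd %ymm5, %ymm3, %ymm0           # register-only FMA

instead of the combined load-op form

  vfnmadd231pd 32(%rdx,%rax), %ymm3, %ymm0

Whether that is a win is exactly what the following comments debate.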
* [Bug tree-optimization/97832] AoSoA complex caxpy-like loops: AVX2+FMA -Ofast 7 times slower than -O3
From: already5chosen at yahoo dot com @ 2022-11-26 18:36 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97832
--- Comment #20 from Michael_S <already5chosen at yahoo dot com> ---
(In reply to Richard Biener from comment #17)
> (In reply to Michael_S from comment #16)
> > On an unrelated note, why does the loop overhead use so many instructions?
> > Assuming that I am as misguided as gcc about load-op combining, I would
> > write it as:
> > sub %rax, %rdx
> > .L3:
> > vmovupd (%rdx,%rax), %ymm1
> > vmovupd 32(%rdx,%rax), %ymm0
> > vfmadd213pd 32(%rax), %ymm3, %ymm1
> > vfnmadd213pd (%rax), %ymm2, %ymm0
> > vfnmadd231pd 32(%rdx,%rax), %ymm3, %ymm0
> > vfnmadd231pd (%rdx,%rax), %ymm2, %ymm1
> > vmovupd %ymm0, (%rax)
> > vmovupd %ymm1, 32(%rax)
> > addq $64, %rax
> > decl %esi
> > jb .L3
> >
> > The loop overhead in my variant is 3 x86 instructions == 2 macro-ops,
> > vs 5 x86 instructions == 4 macro-ops in the gcc variant.
> > Also, in the gcc variant all memory accesses have a displacement, which
> > makes them 1 byte longer. In my variant only half of the accesses have a
> > displacement.
> >
> > I think in the past I have seen cases where gcc generates optimal or
> > near-optimal code sequences for loop overhead. I wonder why it cannot do
> > it here.
>
> I don't think we currently consider IVs based on the difference of two
> addresses.
It seems to me that I have seen you doing it.
But maybe I confuse gcc with clang.
> The cost benefit of no displacement is only size,
Size is pretty important in high-IPC SIMD loops, especially on Intel and when
the number of iterations is small, because Intel has a 16-byte fetch out of the
L1I cache. SIMD instructions tend to be long, and not many instructions fit
within 16 bytes even when memory accesses have no offsets. Offsets add insult
to injury.
> otherwise
> I have no idea why we have biased the %rax accesses by -32. Why we
> fail to consider decrement-to-zero for the counter IV is probably because
> IVCANON would add such an IV but the vectorizer replaces that and IVOPTs
> doesn't consider re-adding it.
Sorry, I have no idea about the meaning of IVCANON.
* [Bug tree-optimization/97832] AoSoA complex caxpy-like loops: AVX2+FMA -Ofast 7 times slower than -O3
From: amonakov at gcc dot gnu.org @ 2022-11-26 19:36 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97832
--- Comment #21 from Alexander Monakov <amonakov at gcc dot gnu.org> ---
(In reply to Michael_S from comment #19)
> > Also note that 'vfnmadd231pd 32(%rdx,%rax), %ymm3, %ymm0' would be
> > 'unlaminated' (turned to 2 uops before renaming), so selecting independent
> > IVs for the two arrays actually helps on this testcase.
>
> Both 'vfnmadd231pd 32(%rdx,%rax), %ymm3, %ymm0' and 'vfnmadd231pd 32(%rdx),
> %ymm3, %ymm0' would be turned into 2 uops.
The difference is at which point in the pipeline. The latter goes through
renaming as one fused uop.
> Misuse of load+op is a far bigger problem in this particular test case than
> the sub-optimal loop overhead. Assuming execution on Intel Skylake, it turns
> a loop that can potentially run at 3 clocks per iteration into a loop of 4+
> clocks per iteration.
Sorry, which assembler output does this refer to?
> But I consider it a separate issue. I reported a similar issue in 97127, but
> here it is more serious. It looks to me that the issue is not soluble within
> the existing gcc optimization framework. The only chance is if you accept my
> old and simple advice - within inner loops, pretend that AVX is RISC, i.e.
> generate code as if the load-op form of AVX instructions didn't exist.
In bug 97127 the best explanation we have so far is that we don't optimally
handle
the case where non-memory inputs of an fma are reused, so we can't combine a
load with an fma without causing an extra register copy (PR 97127 comment 16
demonstrates what I mean). I cannot imagine such trouble arising with more
common commutative operations like mul/add, especially with non-destructive VEX
encoding. If you hit such examples, I would suggest to report them also,
because their root cause might be different.
In general load-op combining should be very helpful on x86, because it reduces
the number of uops flowing through the renaming stage, which is one of the
narrowest points in the pipeline.
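As a minimal intrinsics sketch (hypothetical helper, not from this PR): with a
simple one-register address, the load below can be folded into the FNMA's
memory operand and pass through renaming as a single fused uop; with a
base+index address the fused uop would be unlaminated back into two before
renaming.

#include <immintrin.h>

/* Computes acc - v*f where v is loaded from memory; compile with -mfma.
   With a simple address the compiler may emit a single
   'vfnmadd231pd (%reg), %ymm_f, %ymm_acc' for the two statements. */
static inline __m256d fnma_step(__m256d acc, const double *p, __m256d f)
{
    __m256d v = _mm256_loadu_pd(p);     /* candidate for load-op folding */
    return _mm256_fnmadd_pd(v, f, acc); /* acc - v*f */
}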
* [Bug tree-optimization/97832] AoSoA complex caxpy-like loops: AVX2+FMA -Ofast 7 times slower than -O3
2020-11-14 20:44 [Bug target/97832] New: AoSoA complex caxpy-like loops: AVX2+FMA -Ofast 7 times slower than -O3 already5chosen at yahoo dot com
` (21 preceding siblings ...)
2022-11-26 19:36 ` amonakov at gcc dot gnu.org
@ 2022-11-26 22:00 ` already5chosen at yahoo dot com
2022-11-28 6:29 ` crazylht at gmail dot com
` (3 subsequent siblings)
26 siblings, 0 replies; 28+ messages in thread
From: already5chosen at yahoo dot com @ 2022-11-26 22:00 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97832
--- Comment #22 from Michael_S <already5chosen at yahoo dot com> ---
(In reply to Alexander Monakov from comment #21)
> (In reply to Michael_S from comment #19)
> > > Also note that 'vfnmadd231pd 32(%rdx,%rax), %ymm3, %ymm0' would be
> > > 'unlaminated' (turned to 2 uops before renaming), so selecting independent
> > > IVs for the two arrays actually helps on this testcase.
> >
> > Both 'vfnmadd231pd 32(%rdx,%rax), %ymm3, %ymm0' and 'vfnmadd231pd 32(%rdx),
> > %ymm3, %ymm0' would be turned into 2 uops.
>
> The difference is at which point in the pipeline. The latter goes through
> renaming as one fused uop.
>
Intel never documents such fine details in its Optimization Reference Manual,
but I believe you.
> > Misuse of load+op is a far bigger problem in this particular test case than
> > sub-optimal loop overhead. Assuming execution on Intel Skylake, it turns a
> > loop that can potentially run at 3 clocks per iteration into a loop of 4+
> > clocks per iteration.
>
> Sorry, which assembler output this refers to?
>
gcc12 -O3 -mavx2 -mfma
gcc12 -O3 -march=skylake does not suffer from this problem.
I still think that RISC-style icc code will be a little faster on Skylake, but
here we are arguing about a quarter of a cycle per iteration rather than a full
cycle.
https://godbolt.org/z/nfa7c9se3
> > But I consider it a separate issue. I reported similar issue in 97127, but
> > here it is more serious. It looks to me that the issue is not soluble within
> > existing gcc optimization framework. The only chance is if you accept my old
> > and simple advice - within inner loops pretend that AVX is RISC, i.e.
> > generate code as if load-op form of AVX instructions weren't existing.
>
> In bug 97127 the best explanation we have so far is we don't optimally
> handle the case where non-memory inputs of an fma are reused, so we can't
> combine a load with an fma without causing an extra register copy (PR 97127
> comment 16 demonstrates what I mean). I cannot imagine such trouble arising
> with more common commutative operations like mul/add, especially with
> non-destructive VEX encoding. If you hit such examples, I would suggest to
> report them also, because their root cause might be different.
>
> In general load-op combining should be very helpful on x86, because it
> reduces the number of uops flowing through the renaming stage, which is one
> of the narrowest points in the pipeline.
If compilers were perfect, AVX load-op combining would be somewhat helpful; I
have my doubts about "very helpful". But compilers are not perfect.
In the non-AVX case, where every op is destructive and repeated loads are on
average cheaper than with AVX, combining load+op is far more profitable.
* [Bug tree-optimization/97832] AoSoA complex caxpy-like loops: AVX2+FMA -Ofast 7 times slower than -O3
2020-11-14 20:44 [Bug target/97832] New: AoSoA complex caxpy-like loops: AVX2+FMA -Ofast 7 times slower than -O3 already5chosen at yahoo dot com
` (22 preceding siblings ...)
2022-11-26 22:00 ` already5chosen at yahoo dot com
@ 2022-11-28 6:29 ` crazylht at gmail dot com
2022-11-28 6:42 ` crazylht at gmail dot com
` (2 subsequent siblings)
26 siblings, 0 replies; 28+ messages in thread
From: crazylht at gmail dot com @ 2022-11-28 6:29 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97832
--- Comment #23 from Hongtao.liu <crazylht at gmail dot com> ---
> the blends do not look like no-ops so I wonder if this is really computing
> the same thing ... (it swaps lane 0 from the two loads from x but not the
> stores)
They're computing the same thing, since we also do the same "permutation" for
the invariants f_re and f_im. Can we eliminate that in the vectorizer?
_232 = {f_im_36, f_im_36, f_im_36, f_im_36};
_231 = {f_im_36, f_re_35, f_re_35, f_re_35}; ------- here
_216 = {f_re_35, f_re_35, f_re_35, f_re_35};
_215 = {f_re_35, f_im_36, f_im_36, f_im_36}; ------ and here.
ivtmp.36_221 = (unsigned long) y_41(D);
ivtmp.38_61 = (unsigned long) x_33(D);
<bb 4> [local count: 214748368]:
# ivtmp.32_66 = PHI <ivtmp.32_65(4), 0(3)>
# ivtmp.36_64 = PHI <ivtmp.36_63(4), ivtmp.36_221(3)>
# ivtmp.38_220 = PHI <ivtmp.38_60(4), ivtmp.38_61(3)>
# DEBUG c => NULL
# DEBUG k => 0
# DEBUG BEGIN_STMT
# DEBUG BEGIN_STMT
# DEBUG D#78 => D#79 * 8
# DEBUG D#77 => x_33(D) + D#78
_62 = (void *) ivtmp.38_220;
vect_x_im_61.13_228 = MEM <const vector(4) double> [(const double *)_62];
vect_x_im_61.14_226 = MEM <const vector(4) double> [(const double *)_62 + 32B];
vect_x_re_55.15_225 = VEC_PERM_EXPR <vect_x_im_61.14_226, vect_x_im_61.13_228, { 0, 5, 6, 7 }>;
vect_x_re_55.23_209 = VEC_PERM_EXPR <vect_x_im_61.13_228, vect_x_im_61.14_226, { 0, 5, 6, 7 }>;
# DEBUG D#76 => *D#77
# DEBUG x_re => D#76
# DEBUG BEGIN_STMT
# DEBUG D#74 => (long unsigned int) D#75
# DEBUG D#73 => D#74 * 8
# DEBUG D#72 => x_33(D) + D#73
# DEBUG D#71 => *D#72
# DEBUG x_im => D#71
# DEBUG BEGIN_STMT
# DEBUG D#70 => y_41(D) + D#78
_59 = (void *) ivtmp.36_64;
vect_y_re_63.9_235 = MEM <vector(4) double> [(double *)_59];
vect_y_re_63.10_233 = MEM <vector(4) double> [(double *)_59 + 32B];
vect__42.18_219 = .FMA (vect_x_im_61.13_228, _232, vect_y_re_63.10_233);
vect_y_re_69.17_222 = .FNMA (vect_x_re_55.15_225, _231, vect_y_re_63.9_235);
vect_y_re_69.25_206 = .FNMA (vect_x_re_55.23_209, _215, vect_y_re_69.17_222);
vect_y_re_69.25_205 = .FNMA (_216, vect_x_im_61.14_226, vect__42.18_219);
* [Bug tree-optimization/97832] AoSoA complex caxpy-like loops: AVX2+FMA -Ofast 7 times slower than -O3
2020-11-14 20:44 [Bug target/97832] New: AoSoA complex caxpy-like loops: AVX2+FMA -Ofast 7 times slower than -O3 already5chosen at yahoo dot com
` (23 preceding siblings ...)
2022-11-28 6:29 ` crazylht at gmail dot com
@ 2022-11-28 6:42 ` crazylht at gmail dot com
2022-11-28 7:21 ` rguenther at suse dot de
2022-11-28 7:24 ` crazylht at gmail dot com
26 siblings, 0 replies; 28+ messages in thread
From: crazylht at gmail dot com @ 2022-11-28 6:42 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97832
--- Comment #24 from Hongtao.liu <crazylht at gmail dot com> ---
_233 = {f_im_36, f_re_35, f_re_35, f_re_35};
_217 = {f_re_35, f_im_36, f_im_36, f_im_36};
...
vect_x_re_55.15_227 = VEC_PERM_EXPR <vect_x_im_61.14_228, vect_x_im_61.13_230, { 0, 5, 6, 7 }>;
vect_x_re_55.23_211 = VEC_PERM_EXPR <vect_x_im_61.13_230, vect_x_im_61.14_228, { 0, 5, 6, 7 }>;
...
vect_y_re_69.17_224 = .FNMA (vect_x_re_55.15_227, _233, vect_y_re_63.9_237);
vect_y_re_69.25_208 = .FNMA (vect_x_re_55.23_211, _217, vect_y_re_69.17_224);
is equal to
_233 = {f_im_36, f_im_36, f_im_36, f_im_36};
_217 = {f_re_35, f_re_35, f_re_35, f_re_35};
...
vect_y_re_69.17_224 = .FNMA (vect_x_im_61.14_228, _233, vect_y_re_63.9_237);
vect_y_re_69.25_208 = .FNMA (vect_x_im_61.13_230, _217, vect_y_re_69.17_224);
A simplification in match.pd?
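Lane by lane the equivalence is easy to check with a scalar model (a
hypothetical self-contained test, not part of the PR; the two forms may differ
only in association order, which -Ofast permits):

#include <stdio.h>

int main(void)
{
    double xa[4] = {1, 2, 3, 4}, xb[4] = {5, 6, 7, 8};   /* the two loads */
    double y[4]  = {9, 10, 11, 12};
    double f_re = 0.5, f_im = 0.25;
    for (int k = 0; k < 4; ++k) {
        /* permuted form: VEC_PERM <xb, xa, {0,5,6,7}> etc. with blended invariants */
        double p1 = k ? xa[k] : xb[0], c1 = k ? f_re : f_im;
        double p2 = k ? xb[k] : xa[0], c2 = k ? f_im : f_re;
        double r_perm  = (y[k] - p1 * c1) - p2 * c2;     /* two chained .FNMAs */
        /* unpermuted form with splat invariants */
        double r_plain = (y[k] - xb[k] * f_im) - xa[k] * f_re;
        printf("lane %d: %g == %g\n", k, r_perm, r_plain);
    }
    return 0;
}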
* [Bug tree-optimization/97832] AoSoA complex caxpy-like loops: AVX2+FMA -Ofast 7 times slower than -O3
2020-11-14 20:44 [Bug target/97832] New: AoSoA complex caxpy-like loops: AVX2+FMA -Ofast 7 times slower than -O3 already5chosen at yahoo dot com
` (24 preceding siblings ...)
2022-11-28 6:42 ` crazylht at gmail dot com
@ 2022-11-28 7:21 ` rguenther at suse dot de
2022-11-28 7:24 ` crazylht at gmail dot com
26 siblings, 0 replies; 28+ messages in thread
From: rguenther at suse dot de @ 2022-11-28 7:21 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97832
--- Comment #25 from rguenther at suse dot de <rguenther at suse dot de> ---
On Mon, 28 Nov 2022, crazylht at gmail dot com wrote:
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97832
>
> --- Comment #24 from Hongtao.liu <crazylht at gmail dot com> ---
> _233 = {f_im_36, f_re_35, f_re_35, f_re_35};
> _217 = {f_re_35, f_im_36, f_im_36, f_im_36};
> ...
> vect_x_re_55.15_227 = VEC_PERM_EXPR <vect_x_im_61.14_228, vect_x_im_61.13_230, { 0, 5, 6, 7 }>;
> vect_x_re_55.23_211 = VEC_PERM_EXPR <vect_x_im_61.13_230, vect_x_im_61.14_228, { 0, 5, 6, 7 }>;
> ...
> vect_y_re_69.17_224 = .FNMA (vect_x_re_55.15_227, _233, vect_y_re_63.9_237);
> vect_y_re_69.25_208 = .FNMA (vect_x_re_55.23_211, _217, vect_y_re_69.17_224);
>
> is equal to
>
> _233 = {f_im_36, f_im_36, f_im_36, f_im_36};
> _217 = {f_re_35, f_re_35, f_re_35, f_re_35};
> ...
> vect_y_re_69.17_224 = .FNMA (vect_x_im_61.14_228, _233, vect_y_re_63.9_237);
> vect_y_re_69.25_208 = .FNMA (vect_x_im_61.13_230, _217, vect_y_re_69.17_224);
>
> A simplification in match.pd?
I guess that's possible, but the SLP vectorizer has a permute optimization
phase (and SLP discovery itself); it would be nice to see why the former
doesn't elide the permutes here.
* [Bug tree-optimization/97832] AoSoA complex caxpy-like loops: AVX2+FMA -Ofast 7 times slower than -O3
2020-11-14 20:44 [Bug target/97832] New: AoSoA complex caxpy-like loops: AVX2+FMA -Ofast 7 times slower than -O3 already5chosen at yahoo dot com
` (25 preceding siblings ...)
2022-11-28 7:21 ` rguenther at suse dot de
@ 2022-11-28 7:24 ` crazylht at gmail dot com
26 siblings, 0 replies; 28+ messages in thread
From: crazylht at gmail dot com @ 2022-11-28 7:24 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97832
--- Comment #26 from Hongtao.liu <crazylht at gmail dot com> ---
> I guess that's possible, but the SLP vectorizer has a permute optimization
> phase (and SLP discovery itself); it would be nice to see why the former
> doesn't elide the permutes here.
I've opened PR107891 for it.