public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug middle-end/108410] New: x264 averaging loop not optimized well for avx512
@ 2023-01-14 20:55 hubicka at gcc dot gnu.org
2023-01-16 8:07 ` [Bug middle-end/108410] " rguenth at gcc dot gnu.org
` (11 more replies)
0 siblings, 12 replies; 13+ messages in thread
From: hubicka at gcc dot gnu.org @ 2023-01-14 20:55 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108410
Bug ID: 108410
Summary: x264 averaging loop not optimized well for avx512
Product: gcc
Version: 13.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: middle-end
Assignee: unassigned at gcc dot gnu.org
Reporter: hubicka at gcc dot gnu.org
Target Milestone: ---
x264 benchmark has a loop averaging two unsigned char arrays that is executed
with relatively low trip counts that does not play well with our vectorized
code. For AVX512 most time is spent in unvectorized variant since the average
number of iterations is too small to reach the vector code.
This table shows runtimes of averaging given block size with scalar loop,
vectorized loop for individual vector sizes and aocc codegen:
size scalar 128 256 512 aocc
2 8.13 9.49 9.49 9.49 9.49
4 5.79 6.10 6.10 7.45 6.78
6 5.44 5.43 5.42 6.78 5.87
8 5.19 2.71 5.31 6.44 5.42
12 5.14 3.17 5.33 6.10 4.97
16 4.85 1.19 1.53 5.93 1.36
20 4.82 2.03 1.90 6.10 1.90
24 4.60 0.96 2.58 6.10 2.26
28 4.51 1.55 2.97 6.00 2.55
32 4.52 0.68 0.60 0.60 0.77
34 4.77 0.96 0.88 0.80 0.96
38 4.42 1.36 1.37 1.17 1.29
42 4.40 0.84 1.82 1.73 1.63
So for sizes 2-8 scalar loop wins.
For sizes 12-16 128bit vectorization wins; 20-28 behaves oddly.
However avx512 vectorization is a huge loss for all sizes up to 31 bytes.
aocc seems to win for 16 bytes.
Note that one problem is that for the 256bit vector we peel the epilogue loop
(since the trip count fits in max-completely-peeled-insns and
max-completely-peel-times). Bumping both twice makes the avx512 prologue unrolled
too but it does not seem to help x264 benchmark itself.
bmk.c:
#include <stdlib.h>
unsigned char a[10000];
unsigned char b[10000];
unsigned char c[10000];
__attribute__ ((weak))
void
avg (unsigned char *a, unsigned char *b, unsigned char *c, int size)
{
for (int i = 0; i <size; i++)
{
a[i] = (b[i] + c[i] + 1) >> 1;
}
}
int
main(int argc, char**argv)
{
int size = atoi (argv[1]);
for (long i = 0 ; i < 10000000000/size; i++)
{
avg (a,b,c,size);
}
return 0;
}
#include <stdlib.h>
unsigned char a[10000];
unsigned char b[10000];
unsigned char c[10000];
__attribute__ ((weak))
void
avg (unsigned char *a, unsigned char *b, unsigned char *c, int size)
{
for (int i = 0; i <size; i++)
{
a[i] = (b[i] + c[i] + 1) >> 1;
}
}
int
main(int argc, char**argv)
{
int size = atoi (argv[1]);
for (long i = 0 ; i < 10000000000/size; i++)
{
avg (a,b,c,size);
}
return 0;
}
bmk.sh:
gcc -Ofast -march=native bmk.c -fno-tree-vectorize -o bmk.scalar
gcc -Ofast -march=native bmk.c -mprefer-vector-width=128 -o bmk.128
gcc -Ofast -march=native bmk.c -mprefer-vector-width=256 -o bmk.256
gcc -Ofast -march=native bmk.c -mprefer-vector-width=512 -o bmk.512
~/aocc-compiler-4.0.0//bin/clang -Ofast -march=native bmk.c -o bmk.aocc
echo "size scalar 128 256 512 aocc"
for size in 2 4 6 8 12 16 20 24 28 32 34 38 42
do
scalar=`time -f "%e" ./bmk.scalar $size 2>&1`
v128=`time -f "%e" ./bmk.128 $size 2>&1`
v256=`time -f "%e" ./bmk.256 $size 2>&1`
v512=`time -f "%e" ./bmk.512 $size 2>&1`
aocc=`time -f "%e" ./bmk.aocc $size 2>&1`
printf "%5i %7.2f %7.2f %7.2f %7.2f %7.2f\n" $size $scalar $v128 $v256 $v512
$aocc
done
aocc codegen:
# %bb.0: # %entry
pushq %rbx
.cfi_def_cfa_offset 16
.cfi_offset %rbx, -16
testl %ecx, %ecx
jle .LBB0_15
# %bb.1: # %iter.check
movl %ecx, %r8d
cmpl $16, %ecx
jae .LBB0_3
# %bb.2:
xorl %eax, %eax
jmp .LBB0_14
.LBB0_3: # %vector.memcheck
leaq (%rsi,%r8), %r9
leaq (%rdi,%r8), %rax
leaq (%rdx,%r8), %r10
cmpq %rdi, %r9
seta %r11b
cmpq %rsi, %rax
seta %bl
cmpq %rdi, %r10
seta %r9b
cmpq %rdx, %rax
seta %r10b
xorl %eax, %eax
testb %bl, %r11b
jne .LBB0_14
# %bb.4: # %vector.memcheck
andb %r10b, %r9b
jne .LBB0_14
# %bb.5: # %vector.main.loop.iter.check
cmpl $128, %ecx
jae .LBB0_7
# %bb.6:
xorl %eax, %eax
jmp .LBB0_11
.LBB0_7: # %vector.ph
movl %r8d, %eax
andl $-128, %eax
xorl %ecx, %ecx
.p2align 4, 0x90
.LBB0_8: # %vector.body
# =>This Inner Loop Header: Depth=1
vmovdqu (%rdx,%rcx), %ymm0
vmovdqu 32(%rdx,%rcx), %ymm1
vmovdqu 64(%rdx,%rcx), %ymm2
vmovdqu 96(%rdx,%rcx), %ymm3
vpavgb (%rsi,%rcx), %ymm0, %ymm0
vpavgb 32(%rsi,%rcx), %ymm1, %ymm1
vpavgb 64(%rsi,%rcx), %ymm2, %ymm2
vpavgb 96(%rsi,%rcx), %ymm3, %ymm3
vmovdqu %ymm0, (%rdi,%rcx)
vmovdqu %ymm1, 32(%rdi,%rcx)
vmovdqu %ymm2, 64(%rdi,%rcx)
vmovdqu %ymm3, 96(%rdi,%rcx)
subq $-128, %rcx
cmpq %rcx, %rax
jne .LBB0_8
# %bb.9: # %middle.block
cmpq %r8, %rax
je .LBB0_15
# %bb.10: # %vec.epilog.iter.check
testb $112, %r8b
je .LBB0_14
.LBB0_11: # %vec.epilog.ph
movq %rax, %rcx
movl %r8d, %eax
andl $-16, %eax
.p2align 4, 0x90
.LBB0_12: # %vec.epilog.vector.body
# =>This Inner Loop Header: Depth=1
vmovdqu (%rdx,%rcx), %xmm0
vpavgb (%rsi,%rcx), %xmm0, %xmm0
vmovdqu %xmm0, (%rdi,%rcx)
addq $16, %rcx
cmpq %rcx, %rax
jne .LBB0_12
# %bb.13: # %vec.epilog.middle.block
cmpq %r8, %rax
je .LBB0_15
.p2align 4, 0x90
.LBB0_14: # %for.body
# =>This Inner Loop Header: Depth=1
movzbl (%rsi,%rax), %ecx
movzbl (%rdx,%rax), %ebx
leal 1(%rcx,%rbx), %ecx
shrl %ecx
movb %cl, (%rdi,%rax)
incq %rax
cmpq %rax, %r8
jne .LBB0_14
.LBB0_15: # %for.cond.cleanup
popq %rbx
.cfi_def_cfa_offset 8
vzeroupper
retq
trunk generates the following.
The prologue can be simplified (i.e. cmpl $30, %eax replaced by cmp $31, %rcx)
and there is a 256bit move at .L4 that is just not used for small block sizes
because of the prologue check.
avg:
.LFB11:
.cfi_startproc
movq %rdx, %r8
movl %ecx, %edx
testl %ecx, %ecx
jle .L27
leal -1(%rcx), %eax
movl %ecx, %r9d
cmpl $30, %eax
jbe .L3
leaq 1(%rsi), %r10
movq %rdi, %rcx
subq %r10, %rcx
cmpq $62, %rcx
jbe .L3
leaq 1(%r8), %r10
movq %rdi, %rcx
subq %r10, %rcx
cmpq $62, %rcx
jbe .L3
cmpl $62, %eax
jbe .L12
movl %edx, %ecx
xorl %eax, %eax
shrl $6, %ecx
salq $6, %rcx
.p2align 4
.p2align 3
.L5:
vmovdqu8 (%rsi,%rax), %zmm1
vpavgb (%r8,%rax), %zmm1, %zmm0
vmovdqu8 %zmm0, (%rdi,%rax)
addq $64, %rax
cmpq %rax, %rcx
jne .L5
movl %edx, %eax
andl $-64, %eax
movl %eax, %ecx
cmpl %eax, %edx
je .L26
movl %edx, %r9d
subl %eax, %r9d
leal -1(%r9), %r10d
cmpl $30, %r10d
jbe .L7
.L4:
vmovdqu8 (%rsi,%rcx), %ymm2
vpavgb (%r8,%rcx), %ymm2, %ymm0
vmovdqu8 %ymm0, (%rdi,%rcx)
movl %r9d, %ecx
andl $-32, %ecx
addl %ecx, %eax
andl $31, %r9d
je .L26
.L7:
cltq
.p2align 4
.p2align 3
.L9:
movzbl (%rsi,%rax), %r9d
movzbl (%r8,%rax), %ecx
leal 1(%r9,%rcx), %ecx
sarl %ecx
movb %cl, (%rdi,%rax)
incq %rax
cmpl %eax, %edx
jg .L9
.L26:
vzeroupper
.L27:
ret
.p2align 4
.p2align 3
.L3:
movslq %edx, %rcx
xorl %eax, %eax
.p2align 4
.p2align 3
.L10:
movzbl (%rsi,%rax), %r9d
movzbl (%r8,%rax), %edx
leal 1(%r9,%rdx), %edx
sarl %edx
movb %dl, (%rdi,%rax)
incq %rax
cmpq %rcx, %rax
jne .L10
ret
.L12:
xorl %ecx, %ecx
xorl %eax, %eax
jmp .L4
.cfi_endproc
.LFE11:
^ permalink raw reply [flat|nested] 13+ messages in thread
* [Bug middle-end/108410] x264 averaging loop not optimized well for avx512
2023-01-14 20:55 [Bug middle-end/108410] New: x264 averaging loop not optimized well for avx512 hubicka at gcc dot gnu.org
@ 2023-01-16 8:07 ` rguenth at gcc dot gnu.org
2023-01-18 12:33 ` rguenth at gcc dot gnu.org
` (10 subsequent siblings)
11 siblings, 0 replies; 13+ messages in thread
From: rguenth at gcc dot gnu.org @ 2023-01-16 8:07 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108410
Richard Biener <rguenth at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Blocks| |53947
Last reconfirmed| |2023-01-16
Target| |x86_64-*-*
Keywords| |missed-optimization
CC| |rguenth at gcc dot gnu.org
Ever confirmed|0 |1
Status|UNCONFIRMED |NEW
--- Comment #1 from Richard Biener <rguenth at gcc dot gnu.org> ---
One issue is that we at most perform one epilogue loop vectorization, so with
AVX512 we vectorize the epilogue with AVX2 but its epilogue remains
unvectorized. With AVX512 we'd want to use a fully masked epilogue using
AVX512 instead.
I started working on fully masked vectorization support for AVX512 but
got distracted.
Another option would be to use SSE vectorization for the epilogue
(note for SSE we vectorize the epilogue with 64bit half-SSE vectors!),
which would mean giving the target (some) control over the mode used
for vectorizing the epilogue. That is, in vect_analyze_loop change
/* For epilogues start the analysis from the first mode. The motivation
behind starting from the beginning comes from cases where the VECTOR_MODES
array may contain length-agnostic and length-specific modes. Their
ordering is not guaranteed, so we could end up picking a mode for the main
loop that is after the epilogue's optimal mode. */
vector_modes[0] = autodetected_vector_mode;
to go through a target hook (possibly first produce a "candidate mode" set
and allow the target to prune that). This might be an "easy" fix for the
AVX512 issue for low-trip loops.
Referenced Bugs:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53947
[Bug 53947] [meta-bug] vectorizer missed-optimizations
^ permalink raw reply [flat|nested] 13+ messages in thread
* [Bug middle-end/108410] x264 averaging loop not optimized well for avx512
2023-01-14 20:55 [Bug middle-end/108410] New: x264 averaging loop not optimized well for avx512 hubicka at gcc dot gnu.org
2023-01-16 8:07 ` [Bug middle-end/108410] " rguenth at gcc dot gnu.org
@ 2023-01-18 12:33 ` rguenth at gcc dot gnu.org
2023-01-18 12:46 ` rguenth at gcc dot gnu.org
` (9 subsequent siblings)
11 siblings, 0 replies; 13+ messages in thread
From: rguenth at gcc dot gnu.org @ 2023-01-18 12:33 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108410
--- Comment #2 from Richard Biener <rguenth at gcc dot gnu.org> ---
The naive masked epilogue (--param vect-partial-vector-usage=1 and support
for whileult as in a prototype I have) then looks like
leal -1(%rdx), %eax
cmpl $62, %eax
jbe .L11
.L11:
xorl %ecx, %ecx
jmp .L4
.L4:
movl %ecx, %eax
subl %ecx, %edx
addq %rax, %rsi
addq %rax, %rdi
addq %r8, %rax
cmpl $64, %edx
jl .L8
kxorq %k1, %k1, %k1
kxnorq %k1, %k1, %k1
.L7:
vmovdqu8 (%rsi), %zmm0{%k1}{z}
vmovdqu8 (%rdi), %zmm1{%k1}{z}
vpavgb %zmm1, %zmm0, %zmm0
vmovdqu8 %zmm0, (%rax){%k1}
.L21:
vzeroupper
ret
.L8:
vmovdqa64 .LC0(%rip), %zmm1
vpbroadcastb %edx, %zmm0
vpcmpb $1, %zmm0, %zmm1, %k1
jmp .L7
RTL isn't good at jump threading the mess caused by my ad-hoc whileult
RTL expansion - representing this at a higher level is probably the way
to go. What you'd basically should get is for the epilogue (also used
when the main vectorized loop isn't entered):
vmovdqa64 .LC0(%rip), %zmm1
vpbroadcastb %edx, %zmm0
vpcmpb $1, %zmm0, %zmm1, %k1
vmovdqu8 (%rsi), %zmm0{%k1}{z}
vmovdqu8 (%rdi), %zmm1{%k1}{z}
vpavgb %zmm1, %zmm0, %zmm0
vmovdqu8 %zmm0, (%rax){%k1}
that is a compare of a vector with { niter, niter, ... } with { 0, 1,2 3, .. }
producing the mask (that has a latency of 3 according to agner) and then
simply the vectorized code masked. You can probably assembly code that
if you'd be interested in the (optimal) performance outcome.
For now we probably want to have the main loop traditionally vectorized
without masking because Intel has poor mask support and AMD has bad
latency on the mask producing compares. But having a masked vectorized
epilog avoids the need for a scalar epilog, saving code-size, and
avoids the need to vectorize that multiple times (or choosing SSE vectors
here). For Zen4 the above will of course utilize two 512bit op halves
even when one is fully masked (well, I suppose at least that this is the case).
^ permalink raw reply [flat|nested] 13+ messages in thread
* [Bug middle-end/108410] x264 averaging loop not optimized well for avx512
2023-01-14 20:55 [Bug middle-end/108410] New: x264 averaging loop not optimized well for avx512 hubicka at gcc dot gnu.org
2023-01-16 8:07 ` [Bug middle-end/108410] " rguenth at gcc dot gnu.org
2023-01-18 12:33 ` rguenth at gcc dot gnu.org
@ 2023-01-18 12:46 ` rguenth at gcc dot gnu.org
2023-06-07 12:22 ` rguenth at gcc dot gnu.org
` (8 subsequent siblings)
11 siblings, 0 replies; 13+ messages in thread
From: rguenth at gcc dot gnu.org @ 2023-01-18 12:46 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108410
--- Comment #3 from Richard Biener <rguenth at gcc dot gnu.org> ---
the naive "bad" code-gen produces
size 512-masked
2 12.19
4 6.09
6 4.06
8 3.04
12 2.03
14 1.52
16 1.21
20 1.01
24 0.87
32 0.76
34 0.71
38 0.64
42 0.58
on alberti (you seem to have used the same machine). So the AVX512 "stupid"
code-gen is faster for 6+ elements and I guess optimizing it should then
outperform scalar also for 4 elements. The exact matches for 8 on 128
and 16 on 256 are hard to beat of course, likewise the single or two iteration
case.
^ permalink raw reply [flat|nested] 13+ messages in thread
* [Bug middle-end/108410] x264 averaging loop not optimized well for avx512
2023-01-14 20:55 [Bug middle-end/108410] New: x264 averaging loop not optimized well for avx512 hubicka at gcc dot gnu.org
` (2 preceding siblings ...)
2023-01-18 12:46 ` rguenth at gcc dot gnu.org
@ 2023-06-07 12:22 ` rguenth at gcc dot gnu.org
2023-06-09 12:11 ` rguenth at gcc dot gnu.org
` (7 subsequent siblings)
11 siblings, 0 replies; 13+ messages in thread
From: rguenth at gcc dot gnu.org @ 2023-06-07 12:22 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108410
--- Comment #4 from Richard Biener <rguenth at gcc dot gnu.org> ---
Adding fully masked AVX512 and AVX512 with a masked epilog data:
size scalar 128 256 512 512e 512f
1 9.42 11.32 9.35 11.17 15.13 16.89
2 5.72 6.53 6.66 6.66 7.62 8.56
3 4.49 5.10 5.10 5.74 5.08 5.73
4 4.10 4.33 4.29 5.21 3.79 4.25
6 3.78 3.85 3.86 4.76 2.54 2.85
8 3.64 1.89 3.76 4.50 1.92 2.16
12 3.56 2.21 3.75 4.26 1.26 1.42
16 3.36 0.83 1.06 4.16 0.95 1.07
20 3.39 1.42 1.33 4.07 0.75 0.85
24 3.23 0.66 1.72 4.22 0.62 0.70
28 3.18 1.09 2.04 4.20 0.54 0.61
32 3.16 0.47 0.41 0.41 0.47 0.53
34 3.16 0.67 0.61 0.56 0.44 0.50
38 3.19 0.95 0.95 0.82 0.40 0.45
42 3.09 0.58 1.21 1.13 0.36 0.40
text sizes are not much different:
1389 1837 2125 1629 1721 1689
the AVX2 size is large because we completely peel the scalar epilogue,
same for the SSE case. The scalar epilogue of the 512 loop iterates
32 times (too many for peeling), the masked loop/epilogue are quite
large due to the EVEX encoded instructions so the saved scalar/vector
epilogues do not show.
The AVX512 masked epilogue case now looks like:
.p2align 3
.L5:
vmovdqu8 (%r8,%rax), %zmm0
vpavgb (%rsi,%rax), %zmm0, %zmm0
vmovdqu8 %zmm0, (%rdi,%rax)
addq $64, %rax
cmpq %rcx, %rax
jne .L5
movl %edx, %ecx
andl $-64, %ecx
testb $63, %dl
je .L19
.L4:
movl %ecx, %eax
subl %ecx, %edx
movl $255, %ecx
cmpl %ecx, %edx
cmova %ecx, %edx
vpbroadcastb %edx, %zmm0
vpcmpub $6, .LC0(%rip), %zmm0, %k1
vmovdqu8 (%rsi,%rax), %zmm0{%k1}{z}
vmovdqu8 (%r8,%rax), %zmm1{%k1}{z}
vpavgb %zmm1, %zmm0, %zmm0
vmovdqu8 %zmm0, (%rdi,%rax){%k1}
.L19:
vzeroupper
ret
where there's a missed optimization around the saturation to 255.
The fully masked AVX512 loop is
vmovdqa64 .LC0(%rip), %zmm3
movl $255, %eax
cmpl %eax, %ecx
cmovbe %ecx, %eax
vpbroadcastb %eax, %zmm0
vpcmpub $6, %zmm3, %zmm0, %k1
.p2align 4
.p2align 3
.L4:
vmovdqu8 (%rsi,%rax), %zmm1{%k1}
vmovdqu8 (%r8,%rax), %zmm2{%k1}
movl %r10d, %edx
movl $255, %ecx
subl %eax, %edx
cmpl %ecx, %edx
cmova %ecx, %edx
vpavgb %zmm2, %zmm1, %zmm0
vmovdqu8 %zmm0, (%rdi,%rax){%k1}
vpbroadcastb %edx, %zmm0
addq $64, %rax
movl %r9d, %edx
subl %eax, %edx
vpcmpub $6, %zmm3, %zmm0, %k1
cmpl $64, %edx
ja .L4
vzeroupper
ret
which is a much larger loop body due to the mask creation. At least
that interleaves nicely (dependence wise) with the loop control and
vectorized stmts. What needs to be optimized somehow is what IVOPTs
makes out of the decreasing remaining scalar iters IV with the
IV required for the memory accesses. Without IVOPTs the body looks
like
.L4:
vmovdqu8 (%rsi), %zmm1{%k1}
vmovdqu8 (%rdx), %zmm2{%k1}
movl $255, %eax
movl %ecx, %r8d
subl $64, %ecx
addq $64, %rsi
addq $64, %rdx
vpavgb %zmm2, %zmm1, %zmm0
vmovdqu8 %zmm0, (%rdi){%k1}
addq $64, %rdi
cmpl %eax, %ecx
cmovbe %ecx, %eax
vpbroadcastb %eax, %zmm0
vpcmpub $6, %zmm3, %zmm0, %k1
cmpl $64, %r8d
ja .L4
and the key thing to optimize is
ivtmp_78 = ivtmp_77 + 4294967232; // -64
_79 = MIN_EXPR <ivtmp_78, 255>;
_80 = (unsigned char) _79;
_81 = {_80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80,
_80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80,
_80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80,
_80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80,
_80, _80};
that is we want to broadcast a saturated (to vector element precision) value.
^ permalink raw reply [flat|nested] 13+ messages in thread
* [Bug middle-end/108410] x264 averaging loop not optimized well for avx512
2023-01-14 20:55 [Bug middle-end/108410] New: x264 averaging loop not optimized well for avx512 hubicka at gcc dot gnu.org
` (3 preceding siblings ...)
2023-06-07 12:22 ` rguenth at gcc dot gnu.org
@ 2023-06-09 12:11 ` rguenth at gcc dot gnu.org
2023-06-12 5:48 ` crazylht at gmail dot com
` (6 subsequent siblings)
11 siblings, 0 replies; 13+ messages in thread
From: rguenth at gcc dot gnu.org @ 2023-06-09 12:11 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108410
--- Comment #5 from Richard Biener <rguenth at gcc dot gnu.org> ---
Btw, for the case we can use the same mask compare type as we use as type for
the IV (so we know we can represent all required values) we can elide the
saturation. So for example
void foo (double * __restrict a, double *b, double *c, int n)
{
for (int i = 0; i < n; ++i)
a[i] = b[i] + c[i];
}
can produce
testl %ecx, %ecx
jle .L5
vmovdqa .LC0(%rip), %ymm3
vpbroadcastd %ecx, %ymm2
xorl %eax, %eax
subl $8, %ecx
vpcmpud $6, %ymm3, %ymm2, %k1
.p2align 4
.p2align 3
.L3:
vmovupd (%rsi,%rax), %zmm1{%k1}
vmovupd (%rdx,%rax), %zmm0{%k1}
movl %ecx, %r8d
vaddpd %zmm1, %zmm0, %zmm2{%k1}{z}
addl $8, %r8d
vmovupd %zmm2, (%rdi,%rax){%k1}
vpbroadcastd %ecx, %ymm2
addq $64, %rax
subl $8, %ecx
vpcmpud $6, %ymm3, %ymm2, %k1
cmpl $8, %r8d
ja .L3
vzeroupper
.L5:
ret
That should work as long as the data size is larger or matches the IV size
which is hopefully the case for all FP testcases. The trick is going to be
to make this visible to costing - I'm not sure we get to decide whether
to use masking or not when we do not want to decide between vector sizes
(the x86 backend picks the first successful one). For SVE it's either
masking (with SVE modes) or not masking (with NEON modes) so it's
decided based on mode rather than as additional knob.
Performance-wise the above is likely still slower than not using masking
plus a masked epilog but it would actually save on code-size for -Os
or -O2. Of course for code-size we might want to stick to SSE/AVX
for the smaller encoding.
Note we have to watch out for all-zero masks for masked stores since
that's very slow (for a reason unknown to me), when we have a stmt
split to multiple vector stmts it's not uncommon (esp. for the epilog)
to have one of them with an all-zero bit mask. For the loop case and
.MASK_STORE we emit branchy code for this but we might want to avoid
the situation by costing (and not using a masked loop/epilog in that
case).
^ permalink raw reply [flat|nested] 13+ messages in thread
* [Bug middle-end/108410] x264 averaging loop not optimized well for avx512
2023-01-14 20:55 [Bug middle-end/108410] New: x264 averaging loop not optimized well for avx512 hubicka at gcc dot gnu.org
` (4 preceding siblings ...)
2023-06-09 12:11 ` rguenth at gcc dot gnu.org
@ 2023-06-12 5:48 ` crazylht at gmail dot com
2023-06-12 8:06 ` rguenther at suse dot de
` (5 subsequent siblings)
11 siblings, 0 replies; 13+ messages in thread
From: crazylht at gmail dot com @ 2023-06-12 5:48 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108410
--- Comment #6 from Hongtao.liu <crazylht at gmail dot com> ---
> and the key thing to optimize is
>
> ivtmp_78 = ivtmp_77 + 4294967232; // -64
> _79 = MIN_EXPR <ivtmp_78, 255>;
> _80 = (unsigned char) _79;
> _81 = {_80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80,
> _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80,
> _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80,
> _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80,
> _80, _80, _80, _80, _80, _80};
>
> that is we want to broadcast a saturated (to vector element precision) value.
Yes, backend needs to support vec_pack_ssat_m, vec_pack_usat_m.
But I didn't find optab for ss_truncate or us_truncate which might be used by
BB vectorizer.
^ permalink raw reply [flat|nested] 13+ messages in thread
* [Bug middle-end/108410] x264 averaging loop not optimized well for avx512
2023-01-14 20:55 [Bug middle-end/108410] New: x264 averaging loop not optimized well for avx512 hubicka at gcc dot gnu.org
` (5 preceding siblings ...)
2023-06-12 5:48 ` crazylht at gmail dot com
@ 2023-06-12 8:06 ` rguenther at suse dot de
2023-06-13 3:45 ` crazylht at gmail dot com
` (4 subsequent siblings)
11 siblings, 0 replies; 13+ messages in thread
From: rguenther at suse dot de @ 2023-06-12 8:06 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108410
--- Comment #7 from rguenther at suse dot de <rguenther at suse dot de> ---
On Mon, 12 Jun 2023, crazylht at gmail dot com wrote:
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108410
>
> --- Comment #6 from Hongtao.liu <crazylht at gmail dot com> ---
>
> > and the key thing to optimize is
> >
> > ivtmp_78 = ivtmp_77 + 4294967232; // -64
> > _79 = MIN_EXPR <ivtmp_78, 255>;
> > _80 = (unsigned char) _79;
> > _81 = {_80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80,
> > _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80,
> > _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80,
> > _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80,
> > _80, _80, _80, _80, _80, _80};
> >
> > that is we want to broadcast a saturated (to vector element precision) value.
>
> Yes, backend needs to support vec_pack_ssat_m, vec_pack_usat_m.
Can x86 do this? We'd want to apply this to a scalar, so move ivtmp
to xmm, apply pack_usat or as you say below, the non-existing us_trunc
and then broadcast.
^ permalink raw reply [flat|nested] 13+ messages in thread
* [Bug middle-end/108410] x264 averaging loop not optimized well for avx512
2023-01-14 20:55 [Bug middle-end/108410] New: x264 averaging loop not optimized well for avx512 hubicka at gcc dot gnu.org
` (6 preceding siblings ...)
2023-06-12 8:06 ` rguenther at suse dot de
@ 2023-06-13 3:45 ` crazylht at gmail dot com
2023-06-13 8:05 ` rguenther at suse dot de
` (3 subsequent siblings)
11 siblings, 0 replies; 13+ messages in thread
From: crazylht at gmail dot com @ 2023-06-13 3:45 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108410
--- Comment #8 from Hongtao.liu <crazylht at gmail dot com> ---
> Can x86 do this? We'd want to apply this to a scalar, so move ivtmp
> to xmm, apply pack_usat or as you say below, the non-existing us_trunc
> and then broadcast.
I see, we don't have scalar version. Also vector instruction looks not very
fast.
https://uops.info/html-instr/VPMOVSDB_XMM_XMM.html
^ permalink raw reply [flat|nested] 13+ messages in thread
* [Bug middle-end/108410] x264 averaging loop not optimized well for avx512
2023-01-14 20:55 [Bug middle-end/108410] New: x264 averaging loop not optimized well for avx512 hubicka at gcc dot gnu.org
` (7 preceding siblings ...)
2023-06-13 3:45 ` crazylht at gmail dot com
@ 2023-06-13 8:05 ` rguenther at suse dot de
2023-06-14 12:54 ` rguenth at gcc dot gnu.org
` (2 subsequent siblings)
11 siblings, 0 replies; 13+ messages in thread
From: rguenther at suse dot de @ 2023-06-13 8:05 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108410
--- Comment #9 from rguenther at suse dot de <rguenther at suse dot de> ---
On Tue, 13 Jun 2023, crazylht at gmail dot com wrote:
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108410
>
> --- Comment #8 from Hongtao.liu <crazylht at gmail dot com> ---
>
> > Can x86 do this? We'd want to apply this to a scalar, so move ivtmp
> > to xmm, apply pack_usat or as you say below, the non-existing us_trunc
> > and then broadcast.
>
> I see, we don't have scalar version. Also vector instruction looks not very
> fast.
>
> https://uops.info/html-instr/VPMOVSDB_XMM_XMM.html
Uh, yeah. Well, Zen4 looks reasonable though latency could be better.
Preliminary performance data also shows masked epilogues are a
mixed bag. I'll finish off the implementation and then we'll see
if we can selectively enable it for the profitable cases somehow.
^ permalink raw reply [flat|nested] 13+ messages in thread
* [Bug middle-end/108410] x264 averaging loop not optimized well for avx512
2023-01-14 20:55 [Bug middle-end/108410] New: x264 averaging loop not optimized well for avx512 hubicka at gcc dot gnu.org
` (8 preceding siblings ...)
2023-06-13 8:05 ` rguenther at suse dot de
@ 2023-06-14 12:54 ` rguenth at gcc dot gnu.org
2024-02-09 13:53 ` rguenth at gcc dot gnu.org
2024-04-15 13:29 ` rguenth at gcc dot gnu.org
11 siblings, 0 replies; 13+ messages in thread
From: rguenth at gcc dot gnu.org @ 2023-06-14 12:54 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108410
Richard Biener <rguenth at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Assignee|unassigned at gcc dot gnu.org |rguenth at gcc dot gnu.org
Status|NEW |ASSIGNED
^ permalink raw reply [flat|nested] 13+ messages in thread
* [Bug middle-end/108410] x264 averaging loop not optimized well for avx512
2023-01-14 20:55 [Bug middle-end/108410] New: x264 averaging loop not optimized well for avx512 hubicka at gcc dot gnu.org
` (9 preceding siblings ...)
2023-06-14 12:54 ` rguenth at gcc dot gnu.org
@ 2024-02-09 13:53 ` rguenth at gcc dot gnu.org
2024-04-15 13:29 ` rguenth at gcc dot gnu.org
11 siblings, 0 replies; 13+ messages in thread
From: rguenth at gcc dot gnu.org @ 2024-02-09 13:53 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108410
--- Comment #10 from Richard Biener <rguenth at gcc dot gnu.org> ---
So this is now fixed if you use --param vect-partial-vector-usage=2, there is
at the moment no way to get masking/not masking costed against each other. In
theory vect_analyze_loop_costing and vect_estimate_min_profitable_iters
could do both and we could delay vect_determine_partial_vectors_and_peeling.
^ permalink raw reply [flat|nested] 13+ messages in thread
* [Bug middle-end/108410] x264 averaging loop not optimized well for avx512
2023-01-14 20:55 [Bug middle-end/108410] New: x264 averaging loop not optimized well for avx512 hubicka at gcc dot gnu.org
` (10 preceding siblings ...)
2024-02-09 13:53 ` rguenth at gcc dot gnu.org
@ 2024-04-15 13:29 ` rguenth at gcc dot gnu.org
11 siblings, 0 replies; 13+ messages in thread
From: rguenth at gcc dot gnu.org @ 2024-04-15 13:29 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108410
Richard Biener <rguenth at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Assignee|rguenth at gcc dot gnu.org |unassigned at gcc dot gnu.org
Resolution|--- |FIXED
Status|ASSIGNED |RESOLVED
--- Comment #11 from Richard Biener <rguenth at gcc dot gnu.org> ---
I think "fixed" as far as we can get, esp. w/o considering all possible vector
sizes.
^ permalink raw reply [flat|nested] 13+ messages in thread
end of thread, other threads:[~2024-04-15 13:29 UTC | newest]
Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-01-14 20:55 [Bug middle-end/108410] New: x264 averaging loop not optimized well for avx512 hubicka at gcc dot gnu.org
2023-01-16 8:07 ` [Bug middle-end/108410] " rguenth at gcc dot gnu.org
2023-01-18 12:33 ` rguenth at gcc dot gnu.org
2023-01-18 12:46 ` rguenth at gcc dot gnu.org
2023-06-07 12:22 ` rguenth at gcc dot gnu.org
2023-06-09 12:11 ` rguenth at gcc dot gnu.org
2023-06-12 5:48 ` crazylht at gmail dot com
2023-06-12 8:06 ` rguenther at suse dot de
2023-06-13 3:45 ` crazylht at gmail dot com
2023-06-13 8:05 ` rguenther at suse dot de
2023-06-14 12:54 ` rguenth at gcc dot gnu.org
2024-02-09 13:53 ` rguenth at gcc dot gnu.org
2024-04-15 13:29 ` rguenth at gcc dot gnu.org
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).