public inbox for gcc-bugs@sourceware.org help / color / mirror / Atom feed
From: "hubicka at gcc dot gnu.org" <gcc-bugzilla@gcc.gnu.org> To: gcc-bugs@gcc.gnu.org Subject: [Bug middle-end/108410] New: x264 averaging loop not optimized well for avx512 Date: Sat, 14 Jan 2023 20:55:39 +0000 [thread overview] Message-ID: <bug-108410-4@http.gcc.gnu.org/bugzilla/> (raw) https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108410 Bug ID: 108410 Summary: x264 averaging loop not optimized well for avx512 Product: gcc Version: 13.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: middle-end Assignee: unassigned at gcc dot gnu.org Reporter: hubicka at gcc dot gnu.org Target Milestone: --- x264 benchmark has a loop averaging two unsigned char arrays that is executed with relatively low trip counts that does not play well with our vectorized code. For AVX512 most time is spent in unvectorized variant since the average number of iterations is too small to reach the vector code. This table shows runtimes of averaging given block size with scalar loop, vectorized loop for individual vector sizes and aocc codegen: size scalar 128 256 512 aocc 2 8.13 9.49 9.49 9.49 9.49 4 5.79 6.10 6.10 7.45 6.78 6 5.44 5.43 5.42 6.78 5.87 8 5.19 2.71 5.31 6.44 5.42 12 5.14 3.17 5.33 6.10 4.97 16 4.85 1.19 1.53 5.93 1.36 20 4.82 2.03 1.90 6.10 1.90 24 4.60 0.96 2.58 6.10 2.26 28 4.51 1.55 2.97 6.00 2.55 32 4.52 0.68 0.60 0.60 0.77 34 4.77 0.96 0.88 0.80 0.96 38 4.42 1.36 1.37 1.17 1.29 42 4.40 0.84 1.82 1.73 1.63 So for sizes 2-8 scalar loop wins. For sizes 12-16 128bit vectorization wins, 20-28 behaves funnily. However avx512 vectorization is a huge loss for all sizes up to 31 bytes. aocc seems to win for 16 bytes. Note that one problem is that for 256bit vector we peel the epilogue loop (since trip counts fit in max-completely-peeled-insns and max-completely-peel-times). Bumping both twice makes avx512 prologue unrolled too but it does not seem to help x264 benchmark itself.
bmk.c: #include <stdlib.h> unsigned char a[10000]; unsigned char b[10000]; unsigned char c[10000]; __attribute__ ((weak)) void avg (unsigned char *a, unsigned char *b, unsigned char *c, int size) { for (int i = 0; i <size; i++) { a[i] = (b[i] + c[i] + 1) >> 1; } } int main(int argc, char**argv) { int size = atoi (argv[1]); for (long i = 0 ; i < 10000000000/size; i++) { avg (a,b,c,size); } return 0; } bmk.sh: gcc -Ofast -march=native bmk.c -fno-tree-vectorize -o bmk.scalar gcc -Ofast -march=native bmk.c -mprefer-vector-width=128 -o bmk.128 gcc -Ofast -march=native bmk.c -mprefer-vector-width=256 -o bmk.256 gcc -Ofast -march=native bmk.c -mprefer-vector-width=512 -o bmk.512 ~/aocc-compiler-4.0.0//bin/clang -Ofast -march=native bmk.c -o bmk.aocc echo "size scalar 128 256 512 aocc" for size in 2 4 6 8 12 16 20 24 28 32 34 38 42 do scalar=`time -f "%e" ./bmk.scalar $size 2>&1` v128=`time -f "%e" ./bmk.128 $size 2>&1` v256=`time -f "%e" ./bmk.256 $size 2>&1` v512=`time -f "%e" ./bmk.512 $size 2>&1` aocc=`time -f "%e" ./bmk.aocc $size 2>&1` printf "%5i %7.2f %7.2f %7.2f %7.2f %7.2f\n" $size $scalar $v128 $v256 $v512 $aocc done aocc codegen: # %bb.0: # %entry pushq %rbx .cfi_def_cfa_offset 16 .cfi_offset %rbx, -16 testl %ecx, %ecx jle .LBB0_15 # %bb.1: # %iter.check movl %ecx, %r8d cmpl $16, %ecx jae .LBB0_3 # %bb.2: xorl %eax, %eax jmp .LBB0_14 .LBB0_3: # %vector.memcheck leaq (%rsi,%r8), %r9 leaq (%rdi,%r8), %rax leaq (%rdx,%r8), %r10 cmpq %rdi, %r9 seta %r11b cmpq %rsi, %rax seta %bl cmpq %rdi, %r10 seta %r9b cmpq %rdx, %rax seta %r10b xorl %eax, %eax testb %bl, %r11b jne .LBB0_14 
# %bb.4: # %vector.memcheck andb %r10b, %r9b jne .LBB0_14 # %bb.5: # %vector.main.loop.iter.check cmpl $128, %ecx jae .LBB0_7 # %bb.6: xorl %eax, %eax jmp .LBB0_11 .LBB0_7: # %vector.ph movl %r8d, %eax andl $-128, %eax xorl %ecx, %ecx .p2align 4, 0x90 .LBB0_8: # %vector.body # =>This Inner Loop Header: Depth=1 vmovdqu (%rdx,%rcx), %ymm0 vmovdqu 32(%rdx,%rcx), %ymm1 vmovdqu 64(%rdx,%rcx), %ymm2 vmovdqu 96(%rdx,%rcx), %ymm3 vpavgb (%rsi,%rcx), %ymm0, %ymm0 vpavgb 32(%rsi,%rcx), %ymm1, %ymm1 vpavgb 64(%rsi,%rcx), %ymm2, %ymm2 vpavgb 96(%rsi,%rcx), %ymm3, %ymm3 vmovdqu %ymm0, (%rdi,%rcx) vmovdqu %ymm1, 32(%rdi,%rcx) vmovdqu %ymm2, 64(%rdi,%rcx) vmovdqu %ymm3, 96(%rdi,%rcx) subq $-128, %rcx cmpq %rcx, %rax jne .LBB0_8 # %bb.9: # %middle.block cmpq %r8, %rax je .LBB0_15 # %bb.10: # %vec.epilog.iter.check testb $112, %r8b je .LBB0_14 .LBB0_11: # %vec.epilog.ph movq %rax, %rcx movl %r8d, %eax andl $-16, %eax .p2align 4, 0x90 .LBB0_12: # %vec.epilog.vector.body # =>This Inner Loop Header: Depth=1 vmovdqu (%rdx,%rcx), %xmm0 vpavgb (%rsi,%rcx), %xmm0, %xmm0 vmovdqu %xmm0, (%rdi,%rcx) addq $16, %rcx cmpq %rcx, %rax jne .LBB0_12 # %bb.13: # %vec.epilog.middle.block cmpq %r8, %rax je .LBB0_15 .p2align 4, 0x90 .LBB0_14: # %for.body # =>This Inner Loop Header: Depth=1 movzbl (%rsi,%rax), %ecx movzbl (%rdx,%rax), %ebx leal 1(%rcx,%rbx), %ecx shrl %ecx movb %cl, (%rdi,%rax) incq %rax cmpq %rax, %r8 jne .LBB0_14 .LBB0_15: # %for.cond.cleanup popq %rbx .cfi_def_cfa_offset 8 vzeroupper retq trunk does generate the following. Prologue can be simplified (i.e. cmpl $30, %eax replaced by cmpq $31, %rcx) and there is a 256-bit move at .L4 just not used for small block sizes because of the prologue check. 
avg: .LFB11: .cfi_startproc movq %rdx, %r8 movl %ecx, %edx testl %ecx, %ecx jle .L27 leal -1(%rcx), %eax movl %ecx, %r9d cmpl $30, %eax jbe .L3 leaq 1(%rsi), %r10 movq %rdi, %rcx subq %r10, %rcx cmpq $62, %rcx jbe .L3 leaq 1(%r8), %r10 movq %rdi, %rcx subq %r10, %rcx cmpq $62, %rcx jbe .L3 cmpl $62, %eax jbe .L12 movl %edx, %ecx xorl %eax, %eax shrl $6, %ecx salq $6, %rcx .p2align 4 .p2align 3 .L5: vmovdqu8 (%rsi,%rax), %zmm1 vpavgb (%r8,%rax), %zmm1, %zmm0 vmovdqu8 %zmm0, (%rdi,%rax) addq $64, %rax cmpq %rax, %rcx jne .L5 movl %edx, %eax andl $-64, %eax movl %eax, %ecx cmpl %eax, %edx je .L26 movl %edx, %r9d subl %eax, %r9d leal -1(%r9), %r10d cmpl $30, %r10d jbe .L7 .L4: vmovdqu8 (%rsi,%rcx), %ymm2 vpavgb (%r8,%rcx), %ymm2, %ymm0 vmovdqu8 %ymm0, (%rdi,%rcx) movl %r9d, %ecx andl $-32, %ecx addl %ecx, %eax andl $31, %r9d je .L26 .L7: cltq .p2align 4 .p2align 3 .L9: movzbl (%rsi,%rax), %r9d movzbl (%r8,%rax), %ecx leal 1(%r9,%rcx), %ecx sarl %ecx movb %cl, (%rdi,%rax) incq %rax cmpl %eax, %edx jg .L9 .L26: vzeroupper .L27: ret .p2align 4 .p2align 3 .L3: movslq %edx, %rcx xorl %eax, %eax .p2align 4 .p2align 3 .L10: movzbl (%rsi,%rax), %r9d movzbl (%r8,%rax), %edx leal 1(%r9,%rdx), %edx sarl %edx movb %dl, (%rdi,%rax) incq %rax cmpq %rcx, %rax jne .L10 ret .L12: xorl %ecx, %ecx xorl %eax, %eax jmp .L4 .cfi_endproc .LFE11:
next reply other threads:[~2023-01-14 20:55 UTC|newest] Thread overview: 13+ messages / expand[flat|nested] mbox.gz Atom feed top 2023-01-14 20:55 hubicka at gcc dot gnu.org [this message] 2023-01-16 8:07 ` [Bug middle-end/108410] " rguenth at gcc dot gnu.org 2023-01-18 12:33 ` rguenth at gcc dot gnu.org 2023-01-18 12:46 ` rguenth at gcc dot gnu.org 2023-06-07 12:22 ` rguenth at gcc dot gnu.org 2023-06-09 12:11 ` rguenth at gcc dot gnu.org 2023-06-12 5:48 ` crazylht at gmail dot com 2023-06-12 8:06 ` rguenther at suse dot de 2023-06-13 3:45 ` crazylht at gmail dot com 2023-06-13 8:05 ` rguenther at suse dot de 2023-06-14 12:54 ` rguenth at gcc dot gnu.org 2024-02-09 13:53 ` rguenth at gcc dot gnu.org 2024-04-15 13:29 ` rguenth at gcc dot gnu.org
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=bug-108410-4@http.gcc.gnu.org/bugzilla/ \ --to=gcc-bugzilla@gcc.gnu.org \ --cc=gcc-bugs@gcc.gnu.org \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link. Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).