From: "hubicka at gcc dot gnu.org"
To: gcc-bugs@gcc.gnu.org
Subject: [Bug middle-end/108410] New: x264 averaging loop not optimized well for avx512
Date: Sat, 14 Jan 2023 20:55:39 +0000

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108410

            Bug ID: 108410
           Summary: x264 averaging loop not optimized well for avx512
           Product: gcc
           Version: 13.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: middle-end
          Assignee: unassigned at gcc dot gnu.org
          Reporter: hubicka at gcc dot gnu.org
  Target Milestone: ---

The x264 benchmark has a loop averaging two unsigned char arrays that is
executed with relatively low trip counts, which does not play well with our
vectorized code.  For AVX512 most time is spent in the unvectorized variant,
since the average number of iterations is too small to reach the vector code.

This table shows runtimes (in seconds) of averaging a given block size (in
bytes) with the scalar loop, with the vectorized loop for each preferred
vector width, and with aocc codegen:

size  scalar     128     256     512    aocc
   2    8.13    9.49    9.49    9.49    9.49
   4    5.79    6.10    6.10    7.45    6.78
   6    5.44    5.43    5.42    6.78    5.87
   8    5.19    2.71    5.31    6.44    5.42
  12    5.14    3.17    5.33    6.10    4.97
  16    4.85    1.19    1.53    5.93    1.36
  20    4.82    2.03    1.90    6.10    1.90
  24    4.60    0.96    2.58    6.10    2.26
  28    4.51    1.55    2.97    6.00    2.55
  32    4.52    0.68    0.60    0.60    0.77
  34    4.77    0.96    0.88    0.80    0.96
  38    4.42    1.36    1.37    1.17    1.29
  42    4.40    0.84    1.82    1.73    1.63

So for sizes 2-8 the scalar loop wins, for sizes 12-16 128-bit vectorization
wins, and sizes 20-28 behave funnily.  However, avx512 vectorization is a
huge loss for all sizes up to 31 bytes.  aocc seems to win for 16 bytes.

Note that one problem is that for 256-bit vectors we completely peel the
epilogue loop (since the trip count fits within max-completely-peeled-insns
and max-completely-peel-times).  Bumping both twice makes the avx512 prologue
unrolled too, but it does not seem to help the x264 benchmark itself.
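For reference, the bumping experiment was along these lines (a sketch only:
the doubled values 400 and 32 assume defaults of 200 for
max-completely-peeled-insns and 16 for max-completely-peel-times, which
differ between GCC versions; the output name just follows bmk.sh):

gcc -Ofast -march=native -mprefer-vector-width=512 \
    --param max-completely-peeled-insns=400 \
    --param max-completely-peel-times=32 \
    bmk.c -o bmk.512.peel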
bmk.c:

#include <stdlib.h>

unsigned char a[10000];
unsigned char b[10000];
unsigned char c[10000];

__attribute__ ((weak))
void
avg (unsigned char *a, unsigned char *b, unsigned char *c, int size)
{
  for (int i = 0; i < size; i++)
    a[i] = (b[i] + c[i] + 1) >> 1;
}

int
main(int argc, char **argv)
{
  int size = atoi (argv[1]);
  for (long i = 0; i < 10000000000/size; i++)
    {
      avg (a, b, c, size);
    }
  return 0;
}

bmk.sh:

gcc -Ofast -march=native bmk.c -fno-tree-vectorize -o bmk.scalar
gcc -Ofast -march=native bmk.c -mprefer-vector-width=128 -o bmk.128
gcc -Ofast -march=native bmk.c -mprefer-vector-width=256 -o bmk.256
gcc -Ofast -march=native bmk.c -mprefer-vector-width=512 -o bmk.512
~/aocc-compiler-4.0.0//bin/clang -Ofast -march=native bmk.c -o bmk.aocc
echo "size scalar 128 256 512 aocc"
for size in 2 4 6 8 12 16 20 24 28 32 34 38 42
do
  scalar=`time -f "%e" ./bmk.scalar $size 2>&1`
  v128=`time -f "%e" ./bmk.128 $size 2>&1`
  v256=`time -f "%e" ./bmk.256 $size 2>&1`
  v512=`time -f "%e" ./bmk.512 $size 2>&1`
  aocc=`time -f "%e" ./bmk.aocc $size 2>&1`
  printf "%5i %7.2f %7.2f %7.2f %7.2f %7.2f\n" $size $scalar $v128 $v256 $v512 $aocc
done

aocc codegen:

# %bb.0:                                # %entry
        pushq   %rbx
        .cfi_def_cfa_offset 16
        .cfi_offset %rbx, -16
        testl   %ecx, %ecx
        jle     .LBB0_15
# %bb.1:                                # %iter.check
        movl    %ecx, %r8d
        cmpl    $16, %ecx
        jae     .LBB0_3
# %bb.2:
        xorl    %eax, %eax
        jmp     .LBB0_14
.LBB0_3:                                # %vector.memcheck
        leaq    (%rsi,%r8), %r9
        leaq    (%rdi,%r8), %rax
        leaq    (%rdx,%r8), %r10
        cmpq    %rdi, %r9
        seta    %r11b
        cmpq    %rsi, %rax
        seta    %bl
        cmpq    %rdi, %r10
        seta    %r9b
        cmpq    %rdx, %rax
        seta    %r10b
        xorl    %eax, %eax
        testb   %bl, %r11b
        jne     .LBB0_14
# %bb.4:                                # %vector.memcheck
        andb    %r10b, %r9b
        jne     .LBB0_14
# %bb.5:                                # %vector.main.loop.iter.check
        cmpl    $128, %ecx
        jae     .LBB0_7
# %bb.6:
        xorl    %eax, %eax
        jmp     .LBB0_11
.LBB0_7:                                # %vector.ph
        movl    %r8d, %eax
        andl    $-128, %eax
        xorl    %ecx, %ecx
        .p2align        4, 0x90
.LBB0_8:                                # %vector.body
                                        # =>This Inner Loop Header: Depth=1
        vmovdqu (%rdx,%rcx), %ymm0
        vmovdqu 32(%rdx,%rcx), %ymm1
        vmovdqu 64(%rdx,%rcx), %ymm2
        vmovdqu 96(%rdx,%rcx), %ymm3
        vpavgb  (%rsi,%rcx), %ymm0, %ymm0
        vpavgb  32(%rsi,%rcx), %ymm1, %ymm1
        vpavgb  64(%rsi,%rcx), %ymm2, %ymm2
        vpavgb  96(%rsi,%rcx), %ymm3, %ymm3
        vmovdqu %ymm0, (%rdi,%rcx)
        vmovdqu %ymm1, 32(%rdi,%rcx)
        vmovdqu %ymm2, 64(%rdi,%rcx)
        vmovdqu %ymm3, 96(%rdi,%rcx)
        subq    $-128, %rcx
        cmpq    %rcx, %rax
        jne     .LBB0_8
# %bb.9:                                # %middle.block
        cmpq    %r8, %rax
        je      .LBB0_15
# %bb.10:                               # %vec.epilog.iter.check
        testb   $112, %r8b
        je      .LBB0_14
.LBB0_11:                               # %vec.epilog.ph
        movq    %rax, %rcx
        movl    %r8d, %eax
        andl    $-16, %eax
        .p2align        4, 0x90
.LBB0_12:                               # %vec.epilog.vector.body
                                        # =>This Inner Loop Header: Depth=1
        vmovdqu (%rdx,%rcx), %xmm0
        vpavgb  (%rsi,%rcx), %xmm0, %xmm0
        vmovdqu %xmm0, (%rdi,%rcx)
        addq    $16, %rcx
        cmpq    %rcx, %rax
        jne     .LBB0_12
# %bb.13:                               # %vec.epilog.middle.block
        cmpq    %r8, %rax
        je      .LBB0_15
        .p2align        4, 0x90
.LBB0_14:                               # %for.body
                                        # =>This Inner Loop Header: Depth=1
        movzbl  (%rsi,%rax), %ecx
        movzbl  (%rdx,%rax), %ebx
        leal    1(%rcx,%rbx), %ecx
        shrl    %ecx
        movb    %cl, (%rdi,%rax)
        incq    %rax
        cmpq    %rax, %r8
        jne     .LBB0_14
.LBB0_15:                               # %for.cond.cleanup
        popq    %rbx
        .cfi_def_cfa_offset 8
        vzeroupper
        retq
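For illustration, the aocc dispatch above corresponds roughly to the
following C (a hand-written sketch using the avg intrinsics, not compiler
output; the %vector.memcheck aliasing test is omitted for brevity, so this
assumes the arrays do not overlap):

#include <immintrin.h>

/* Sketch of the aocc code layout: vpavgb computes the rounding
   average (x + y + 1) >> 1, which is exactly the loop body.  */
void
avg_sketch (unsigned char *a, unsigned char *b, unsigned char *c, int size)
{
  int i = 0;
  if (size >= 16)
    {
      /* Main loop: 128 bytes per iteration as four 32-byte vpavgb
         (.LBB0_8 above).  */
      for (; i + 128 <= size; i += 128)
        for (int j = i; j < i + 128; j += 32)
          {
            __m256i x = _mm256_loadu_si256 ((__m256i const *) (b + j));
            __m256i y = _mm256_loadu_si256 ((__m256i const *) (c + j));
            _mm256_storeu_si256 ((__m256i *) (a + j),
                                 _mm256_avg_epu8 (x, y));
          }
      /* Vector epilogue: one 16-byte vpavgb per iteration (.LBB0_12),
         also reachable directly when 16 <= size < 128.  */
      for (; i + 16 <= size; i += 16)
        {
          __m128i x = _mm_loadu_si128 ((__m128i const *) (b + i));
          __m128i y = _mm_loadu_si128 ((__m128i const *) (c + i));
          _mm_storeu_si128 ((__m128i *) (a + i), _mm_avg_epu8 (x, y));
        }
    }
  /* Scalar tail (.LBB0_14).  */
  for (; i < size; i++)
    a[i] = (b[i] + c[i] + 1) >> 1;
}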
Trunk generates the following.  The prologue can be simplified (i.e. the
leal -1(%rcx), %eax / cmpl $30, %eax pair could be replaced by a single
cmpl $31, %ecx on the size itself), and there is a 256-bit epilogue at .L4
that is just not used for small block sizes because of the prologue check.

avg:
.LFB11:
        .cfi_startproc
        movq    %rdx, %r8
        movl    %ecx, %edx
        testl   %ecx, %ecx
        jle     .L27
        leal    -1(%rcx), %eax
        movl    %ecx, %r9d
        cmpl    $30, %eax
        jbe     .L3
        leaq    1(%rsi), %r10
        movq    %rdi, %rcx
        subq    %r10, %rcx
        cmpq    $62, %rcx
        jbe     .L3
        leaq    1(%r8), %r10
        movq    %rdi, %rcx
        subq    %r10, %rcx
        cmpq    $62, %rcx
        jbe     .L3
        cmpl    $62, %eax
        jbe     .L12
        movl    %edx, %ecx
        xorl    %eax, %eax
        shrl    $6, %ecx
        salq    $6, %rcx
        .p2align 4
        .p2align 3
.L5:
        vmovdqu8        (%rsi,%rax), %zmm1
        vpavgb  (%r8,%rax), %zmm1, %zmm0
        vmovdqu8        %zmm0, (%rdi,%rax)
        addq    $64, %rax
        cmpq    %rax, %rcx
        jne     .L5
        movl    %edx, %eax
        andl    $-64, %eax
        movl    %eax, %ecx
        cmpl    %eax, %edx
        je      .L26
        movl    %edx, %r9d
        subl    %eax, %r9d
        leal    -1(%r9), %r10d
        cmpl    $30, %r10d
        jbe     .L7
.L4:
        vmovdqu8        (%rsi,%rcx), %ymm2
        vpavgb  (%r8,%rcx), %ymm2, %ymm0
        vmovdqu8        %ymm0, (%rdi,%rcx)
        movl    %r9d, %ecx
        andl    $-32, %ecx
        addl    %ecx, %eax
        andl    $31, %r9d
        je      .L26
.L7:
        cltq
        .p2align 4
        .p2align 3
.L9:
        movzbl  (%rsi,%rax), %r9d
        movzbl  (%r8,%rax), %ecx
        leal    1(%r9,%rcx), %ecx
        sarl    %ecx
        movb    %cl, (%rdi,%rax)
        incq    %rax
        cmpl    %eax, %edx
        jg      .L9
.L26:
        vzeroupper
.L27:
        ret
        .p2align 4
        .p2align 3
.L3:
        movslq  %edx, %rcx
        xorl    %eax, %eax
        .p2align 4
        .p2align 3
.L10:
        movzbl  (%rsi,%rax), %r9d
        movzbl  (%r8,%rax), %edx
        leal    1(%r9,%rdx), %edx
        sarl    %edx
        movb    %dl, (%rdi,%rax)
        incq    %rax
        cmpq    %rcx, %rax
        jne     .L10
        ret
.L12:
        xorl    %ecx, %ecx
        xorl    %eax, %eax
        jmp     .L4
        .cfi_endproc
.LFE11: