From: "rguenth at gcc dot gnu.org"
To: gcc-bugs@gcc.gnu.org
Subject: [Bug middle-end/108410] x264 averaging loop not optimized well for avx512
Date: Wed, 07 Jun 2023 12:22:54 +0000

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108410

--- Comment #4 from Richard Biener <rguenth at gcc dot gnu.org> ---
Adding fully masked AVX512 and AVX512 with a masked epilog data:

size   scalar     128     256     512    512e    512f
   1     9.42   11.32    9.35   11.17   15.13   16.89
   2     5.72    6.53    6.66    6.66    7.62    8.56
   3     4.49    5.10    5.10    5.74    5.08    5.73
   4     4.10    4.33    4.29    5.21    3.79    4.25
   6     3.78    3.85    3.86    4.76    2.54    2.85
   8     3.64    1.89    3.76    4.50    1.92    2.16
  12     3.56    2.21    3.75    4.26    1.26    1.42
  16     3.36    0.83    1.06    4.16    0.95    1.07
  20     3.39    1.42    1.33    4.07    0.75    0.85
  24     3.23    0.66    1.72    4.22    0.62    0.70
  28     3.18    1.09    2.04    4.20    0.54    0.61
  32     3.16    0.47    0.41    0.41    0.47    0.53
  34     3.16    0.67    0.61    0.56    0.44    0.50
  38     3.19    0.95    0.95    0.82    0.40    0.45
  42     3.09    0.58    1.21    1.13    0.36    0.40

Text sizes are not much different: 1389 1837 2125 1629 1721 1689.

The AVX2 size is large because we completely peel the scalar epilogue,
same for the SSE case.  The scalar epilogue of the 512 loop iterates 32
times (too many for peeling); the masked loop/epilogue are quite large
due to the EVEX encoded instructions, so the saved scalar/vector
epilogues do not show.

The AVX512 masked epilogue case now looks like:

        .p2align 3
.L5:
        vmovdqu8        (%r8,%rax), %zmm0
        vpavgb  (%rsi,%rax), %zmm0, %zmm0
        vmovdqu8        %zmm0, (%rdi,%rax)
        addq    $64, %rax
        cmpq    %rcx, %rax
        jne     .L5
        movl    %edx, %ecx
        andl    $-64, %ecx
        testb   $63, %dl
        je      .L19
.L4:
        movl    %ecx, %eax
        subl    %ecx, %edx
        movl    $255, %ecx
        cmpl    %ecx, %edx
        cmova   %ecx, %edx
        vpbroadcastb    %edx, %zmm0
        vpcmpub $6, .LC0(%rip), %zmm0, %k1
        vmovdqu8        (%rsi,%rax), %zmm0{%k1}{z}
        vmovdqu8        (%r8,%rax), %zmm1{%k1}{z}
        vpavgb  %zmm1, %zmm0, %zmm0
        vmovdqu8        %zmm0, (%rdi,%rax){%k1}
.L19:
        vzeroupper
        ret

where there's a missed optimization around the saturation to 255.
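For reference, the loop in question is a simple unsigned char averaging
kernel; the following is only a sketch with made-up names, not the exact
testcase from the PR, but vpavgb implements precisely this rounding
average (a + b + 1) >> 1:

  void
  avg_u8 (unsigned char *restrict dst, const unsigned char *restrict a,
          const unsigned char *restrict b, int n)
  {
    for (int i = 0; i < n; i++)
      dst[i] = (a[i] + b[i] + 1) >> 1;  /* rounding average, maps to vpavgb */
  }

Options along the lines of -mprefer-vector-width=512 and
--param vect-partial-vector-usage=2 select the 512-bit and fully masked
variants; the exact flags used for the table above are an assumption, they
are not spelled out in this comment.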
The fully masked AVX512 loop is

        vmovdqa64       .LC0(%rip), %zmm3
        movl    $255, %eax
        cmpl    %eax, %ecx
        cmovbe  %ecx, %eax
        vpbroadcastb    %eax, %zmm0
        vpcmpub $6, %zmm3, %zmm0, %k1
        .p2align 4
        .p2align 3
.L4:
        vmovdqu8        (%rsi,%rax), %zmm1{%k1}
        vmovdqu8        (%r8,%rax), %zmm2{%k1}
        movl    %r10d, %edx
        movl    $255, %ecx
        subl    %eax, %edx
        cmpl    %ecx, %edx
        cmova   %ecx, %edx
        vpavgb  %zmm2, %zmm1, %zmm0
        vmovdqu8        %zmm0, (%rdi,%rax){%k1}
        vpbroadcastb    %edx, %zmm0
        addq    $64, %rax
        movl    %r9d, %edx
        subl    %eax, %edx
        vpcmpub $6, %zmm3, %zmm0, %k1
        cmpl    $64, %edx
        ja      .L4
        vzeroupper
        ret

which is a much larger loop body due to the mask creation.  At least that
interleaves nicely (dependence-wise) with the loop control and the
vectorized stmts.  What needs to be optimized somehow is what IVOPTs makes
out of the decreasing remaining-scalar-iterations IV together with the IV
required for the memory accesses.  Without IVOPTs the body looks like

.L4:
        vmovdqu8        (%rsi), %zmm1{%k1}
        vmovdqu8        (%rdx), %zmm2{%k1}
        movl    $255, %eax
        movl    %ecx, %r8d
        subl    $64, %ecx
        addq    $64, %rsi
        addq    $64, %rdx
        vpavgb  %zmm2, %zmm1, %zmm0
        vmovdqu8        %zmm0, (%rdi){%k1}
        addq    $64, %rdi
        cmpl    %eax, %ecx
        cmovbe  %ecx, %eax
        vpbroadcastb    %eax, %zmm0
        vpcmpub $6, %zmm3, %zmm0, %k1
        cmpl    $64, %r8d
        ja      .L4

and the key thing to optimize is

  ivtmp_78 = ivtmp_77 + 4294967232;   // -64
  _79 = MIN_EXPR <ivtmp_78, 255>;
  _80 = (unsigned char) _79;
  _81 = {_80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80,
         _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80,
         _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80,
         _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80,
         _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80};

that is, we want to broadcast a saturated (to vector element precision) value.
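To make that concrete, here is a C sketch (my illustration, not GCC output
or internals) of what the statements above compute each iteration:
saturate the remaining iteration count to the element range, broadcast it
and compare unsigned-greater-than against the constant {0, ..., 63} vector
(.LC0) to produce the loop mask:

  #include <immintrin.h>

  /* Illustrative only; mirrors the vpbroadcastb/vpcmpub pair above.
     Requires AVX512BW, name and interface made up for the sketch.  */
  static __mmask64
  remaining_to_mask (unsigned int remaining)
  {
    static const unsigned char iota[64] = {
       0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
      16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
      32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
      48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 };

    /* MIN_EXPR <remaining, 255> plus the narrowing to unsigned char.  */
    unsigned char rem = remaining > 255 ? 255 : (unsigned char) remaining;

    __m512i idx = _mm512_loadu_si512 (iota);       /* .LC0             */
    __m512i bc  = _mm512_set1_epi8 ((char) rem);   /* vpbroadcastb     */
    return _mm512_cmpgt_epu8_mask (bc, idx);       /* vpcmpub $6 (nle) */
  }

The scalar min/cmov, the truncation and the vpbroadcastb together
implement that "broadcast a saturated value" operation, which is what the
vectorizer would like to express directly.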