From: Richard Sandiford
To: Richard Biener
Cc: Hongtao Liu, GCC Patches
Subject: Re: [PATCH 2/2][RFC] Add loop masking support for x86
Date: Thu, 15 Jul 2021 15:57:26 +0100

Richard Biener writes:
> On Thu, 15 Jul 2021, Hongtao Liu wrote:
>
>> On Thu, Jul 15, 2021 at 6:45 PM Richard Biener via Gcc-patches wrote:
>> >
>> > On Thu, Jul 15, 2021 at 12:30 PM Richard Biener wrote:
>> > >
>> > > The following extends the existing loop masking support using
>> > > SVE WHILE_ULT to x86 by providing an alternate way to produce the
>> > > mask using VEC_COND_EXPRs.  So with --param vect-partial-vector-usage
>> > > you can now enable masked vectorized epilogues (=1) or fully
>> > > masked vector loops (=2).
>> > >
>> > > What's missing is using a scalar IV for the loop control
>> > > (but in principle AVX512 can use the mask here - just the patch
>> > > doesn't seem to work for AVX512 yet for some reason - likely
>> > > expand_vec_cond_expr_p doesn't work there).  What's also missing
>> > > is providing more support for predicated operations in the case
>> > > of reductions either via VEC_COND_EXPRs or via implementing
>> > > some of the .COND_{ADD,SUB,MUL...} internal functions as mapping
>> > > to masked AVX512 operations.
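[Editorial sketch, not part of the quoted patch description: a minimal
scalar model of the mask generation being described, assuming an 8-lane
vector of 32-bit elements; the helper names are illustrative.  Lane j of
the mask is active iff iv + j < n, which is roughly what the emulated
WHILE_ULT / VEC_COND_EXPR computes.  Since AVX2 has no unsigned vector
compare, the generated code below apparently biases both operands by
0x80000000 (the vpsubd of broadcast -2147483648) before a signed
vpcmpgtd.]

#include <stdint.h>

enum { NLANES = 8 };            /* one %ymm of 32-bit lanes */

/* Lane j is active iff iteration iv + j is still in range.  */
void
make_loop_mask (uint32_t mask[NLANES], uint32_t iv, uint32_t n)
{
  for (int j = 0; j < NLANES; ++j)
    mask[j] = (iv + j < n) ? 0xffffffffu : 0;
}

/* The masked loop body only touches active lanes, as the vpmaskmovd
   loads and stores in the AVX2 code below do.  */
void
masked_add_step (unsigned *b, const unsigned *a, uint32_t iv, uint32_t n)
{
  uint32_t mask[NLANES];
  make_loop_mask (mask, iv, n);
  for (int j = 0; j < NLANES; ++j)
    if (mask[j])
      b[iv + j] += a[iv + j];
}

[With --param vect-partial-vector-usage=2 the whole loop runs this way;
with =1 only the epilogue does.]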
>> > >
>> > > For AVX2 and
>> > >
>> > > int foo (unsigned *a, unsigned * __restrict b, int n)
>> > > {
>> > >   unsigned sum = 1;
>> > >   for (int i = 0; i < n; ++i)
>> > >     b[i] += a[i];
>> > >   return sum;
>> > > }
>> > >
>> > > we get
>> > >
>> > > .L3:
>> > >         vpmaskmovd (%rsi,%rax), %ymm0, %ymm3
>> > >         vpmaskmovd (%rdi,%rax), %ymm0, %ymm1
>> > >         addl $8, %edx
>> > >         vpaddd %ymm3, %ymm1, %ymm1
>> > >         vpmaskmovd %ymm1, %ymm0, (%rsi,%rax)
>> > >         vmovd %edx, %xmm1
>> > >         vpsubd %ymm15, %ymm2, %ymm0
>> > >         addq $32, %rax
>> > >         vpbroadcastd %xmm1, %ymm1
>> > >         vpaddd %ymm4, %ymm1, %ymm1
>> > >         vpsubd %ymm15, %ymm1, %ymm1
>> > >         vpcmpgtd %ymm1, %ymm0, %ymm0
>> > >         vptest %ymm0, %ymm0
>> > >         jne .L3
>> > >
>> > > for the fully masked loop body and for the masked epilogue
>> > > we see
>> > >
>> > > .L4:
>> > >         vmovdqu (%rsi,%rax), %ymm3
>> > >         vpaddd (%rdi,%rax), %ymm3, %ymm0
>> > >         vmovdqu %ymm0, (%rsi,%rax)
>> > >         addq $32, %rax
>> > >         cmpq %rax, %rcx
>> > >         jne .L4
>> > >         movl %edx, %eax
>> > >         andl $-8, %eax
>> > >         testb $7, %dl
>> > >         je .L11
>> > > .L3:
>> > >         subl %eax, %edx
>> > >         vmovdqa .LC0(%rip), %ymm1
>> > >         salq $2, %rax
>> > >         vmovd %edx, %xmm0
>> > >         movl $-2147483648, %edx
>> > >         addq %rax, %rsi
>> > >         vmovd %edx, %xmm15
>> > >         vpbroadcastd %xmm0, %ymm0
>> > >         vpbroadcastd %xmm15, %ymm15
>> > >         vpsubd %ymm15, %ymm1, %ymm1
>> > >         vpsubd %ymm15, %ymm0, %ymm0
>> > >         vpcmpgtd %ymm1, %ymm0, %ymm0
>> > >         vpmaskmovd (%rsi), %ymm0, %ymm1
>> > >         vpmaskmovd (%rdi,%rax), %ymm0, %ymm2
>> > >         vpaddd %ymm2, %ymm1, %ymm1
>> > >         vpmaskmovd %ymm1, %ymm0, (%rsi)
>> > > .L11:
>> > >         vzeroupper
>> > >
>> > > compared to
>> > >
>> > > .L3:
>> > >         movl %edx, %r8d
>> > >         subl %eax, %r8d
>> > >         leal -1(%r8), %r9d
>> > >         cmpl $2, %r9d
>> > >         jbe .L6
>> > >         leaq (%rcx,%rax,4), %r9
>> > >         vmovdqu (%rdi,%rax,4), %xmm2
>> > >         movl %r8d, %eax
>> > >         andl $-4, %eax
>> > >         vpaddd (%r9), %xmm2, %xmm0
>> > >         addl %eax, %esi
>> > >         andl $3, %r8d
>> > >         vmovdqu %xmm0, (%r9)
>> > >         je .L2
>> > > .L6:
>> > >         movslq %esi, %r8
>> > >         leaq 0(,%r8,4), %rax
>> > >         movl (%rdi,%r8,4), %r8d
>> > >         addl %r8d, (%rcx,%rax)
>> > >         leal 1(%rsi), %r8d
>> > >         cmpl %r8d, %edx
>> > >         jle .L2
>> > >         addl $2, %esi
>> > >         movl 4(%rdi,%rax), %r8d
>> > >         addl %r8d, 4(%rcx,%rax)
>> > >         cmpl %esi, %edx
>> > >         jle .L2
>> > >         movl 8(%rdi,%rax), %edx
>> > >         addl %edx, 8(%rcx,%rax)
>> > > .L2:
>> > >
>> > > I'm giving this a little testing right now but will dig on why
>> > > I don't get masked loops when AVX512 is enabled.
>> >
>> > Ah, a simple thinko - rgroup_controls vectypes seem to be
>> > always VECTOR_BOOLEAN_TYPE_P and thus we can
>> > use expand_vec_cmp_expr_p.  The AVX512 fully masked
>> > loop then looks like
>> >
>> > .L3:
>> >         vmovdqu32 (%rsi,%rax,4), %ymm2{%k1}
>> >         vmovdqu32 (%rdi,%rax,4), %ymm1{%k1}
>> >         vpaddd %ymm2, %ymm1, %ymm0
>> >         vmovdqu32 %ymm0, (%rsi,%rax,4){%k1}
>> >         addq $8, %rax
>> >         vpbroadcastd %eax, %ymm0
>> >         vpaddd %ymm4, %ymm0, %ymm0
>> >         vpcmpud $6, %ymm0, %ymm3, %k1
>> >         kortestb %k1, %k1
>> >         jne .L3
>> >
>> > I guess for x86 it's not worth preserving the VEC_COND_EXPR
>> > mask generation but other archs may not provide all required vec_cmp
>> > expanders.
>>
>> For the main loop, the fully masked loop's codegen seems much worse.
>> Basically, we need at least 4 instructions to do what while_ult in arm does.
>>
>>         vpbroadcastd %eax, %ymm0
>>         vpaddd %ymm4, %ymm0, %ymm0
>>         vpcmpud $6, %ymm0, %ymm3, %k1
>>         kortestb %k1, %k1
>> vs
>>         whilelo (or some other while)
>>
>> more instructions are needed for avx2 since there's no direct
>> instruction for .COND_{ADD,SUB..}
>>
>> original
>> .L4:
>>         vmovdqu (%rcx,%rax), %ymm1
>>         vpaddd (%rdi,%rax), %ymm1, %ymm0
>>         vmovdqu %ymm0, (%rcx,%rax)
>>         addq $32, %rax
>>         cmpq %rax, %rsi
>>         jne .L4
>>
>> vs
>> avx512 fully masked loop
>> .L3:
>>         vmovdqu32 (%rsi,%rax,4), %ymm2{%k1}
>>         vmovdqu32 (%rdi,%rax,4), %ymm1{%k1}
>>         vpaddd %ymm2, %ymm1, %ymm0
>>         vmovdqu32 %ymm0, (%rsi,%rax,4){%k1}
>>         addq $8, %rax
>>         vpbroadcastd %eax, %ymm0
>>         vpaddd %ymm4, %ymm0, %ymm0
>>         vpcmpud $6, %ymm0, %ymm3, %k1
>>         kortestb %k1, %k1
>>         jne .L3
>>
>> vs
>> avx2 fully masked loop
>> .L3:
>>         vpmaskmovd (%rsi,%rax), %ymm0, %ymm3
>>         vpmaskmovd (%rdi,%rax), %ymm0, %ymm1
>>         addl $8, %edx
>>         vpaddd %ymm3, %ymm1, %ymm1
>>         vpmaskmovd %ymm1, %ymm0, (%rsi,%rax)
>>         vmovd %edx, %xmm1
>>         vpsubd %ymm15, %ymm2, %ymm0
>>         addq $32, %rax
>>         vpbroadcastd %xmm1, %ymm1
>>         vpaddd %ymm4, %ymm1, %ymm1
>>         vpsubd %ymm15, %ymm1, %ymm1
>>         vpcmpgtd %ymm1, %ymm0, %ymm0
>>         vptest %ymm0, %ymm0
>>         jne .L3
>>
>> vs arm64's code
>>
>> .L3:
>>         ld1w z1.s, p0/z, [x1, x3, lsl 2]
>>         ld1w z0.s, p0/z, [x0, x3, lsl 2]
>>         add z0.s, z0.s, z1.s
>>         st1w z0.s, p0, [x1, x3, lsl 2]
>>         add x3, x3, x4
>>         whilelo p0.s, w3, w2
>>         b.any .L3
>
> Yes, that's true - it might still be OK for vectorizing epilogues
> and thus --param vect-partial-vector-usage=1
>
> Can AVX512 do any better than this?
>
>         vpbroadcastd %eax, %ymm0
>         vpaddd %ymm4, %ymm0, %ymm0
>         vpcmpud $6, %ymm0, %ymm3, %k1
>
> Note with multiple types involved things get even worse
> since you need masks for each vector mode.  But as far as
> I can see that's the same for SVE (but there we have the
> single-instruction whilelo).  I guess we'll also generate
> wrong code at the moment for the case where we need
> multiple vectors to hold the full mask - vect_gen_while
> doesn't seem to be prepared for this?
>
> So with
>
> int foo (unsigned long *a, unsigned * __restrict b, int n)
> {
>   unsigned sum = 1;
>   for (int i = 0; i < n; ++i)
>     {
>       b[i] += a[i];
>     }
>   return sum;
> }
>
> SVE uses
>
> .L3:
>         ld1d z0.d, p0/z, [x1, x3, lsl 3]
>         ld1d z1.d, p0/z, [x0, x3, lsl 3]
>         adr z0.d, [z0.d, z1.d, lsl 2]
>         st1d z0.d, p0, [x1, x3, lsl 3]
>         add x3, x3, x4
>         whilelo p0.d, w3, w2
>         b.any .L3
>
> so p0 vs. p0/z, whatever that means and it looks like
> the vector add can somehow concatenate z0.d and z1.d.
> Truly fascinating ;)

It looks like this is from a version in which "b" was also long.
For the loop above I get:

.L3:
        ld1d z0.d, p0/z, [x0, x3, lsl 3]
        ld1w z1.d, p0/z, [x1, x3, lsl 2]
        add z0.s, z0.s, z1.s
        st1w z0.d, p0, [x1, x3, lsl 2]
        add x3, x3, x4
        whilelo p0.d, w3, w2
        b.any .L3

where the ld1w is an extending load and the st1w is a truncating store.

But yeah, there's a hard-coded assumption that a mask for 2 SIs
can also control 1 DI, etc.
For example:

void foo (unsigned long *a, unsigned * __restrict b, int n)
{
  for (int i = 0; i < n; ++i)
    {
      a[i * 2] += 1;
      a[i * 2 + 1] += 2;
      b[i * 4] += 1;
      b[i * 4 + 1] += 2;
      b[i * 4 + 2] += 3;
      b[i * 4 + 3] += 4;
    }
}

becomes:

.L3:
        ld1d z0.d, p0/z, [x0]
        add z0.d, z0.d, z2.d
        st1d z0.d, p0, [x0]
        ld1w z0.s, p0/z, [x1, x3, lsl 2]
        add z0.s, z0.s, z1.s
        st1w z0.s, p0, [x1, x3, lsl 2]
        add x0, x0, x4
        add x3, x3, x5
        whilelo p0.s, x3, x2
        b.any .L3

This is explained more in the big comment above rgroup_controls
(guess you've already seen it).

> It looks like --param vect_partial_vector_usage defaults to 2,
> power forces it to 1 (power10) or 0 otherwise.
>
> I think we'd need a target hook that toggles this per mode
> so we could tune this dependent on AVX512 vectorization vs. AVX2.

Yeah, keying it off the architecture made sense for Power because
the only purpose of len_load/store is to support partial vectorisation.
But I agree a hook is needed now that we have MASK_LOAD/STORE for
general use and are emulating WHILE_ULT.

Thanks,
Richard
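[Editorial addendum, not part of the exchange: a rough intrinsics
rendering of the AVX-512VL fully masked loop quoted above, for readers
who prefer C to assembly.  The function name is made up, 256-bit vectors
and n > 0 are assumed, and for simplicity the loop is controlled by the
scalar IV rather than by kortestb on the next mask - exactly the
scalar-IV control the patch description lists as missing.  Compile with
-mavx512f -mavx512vl.]

#include <immintrin.h>

void
masked_add_loop (unsigned *a, unsigned * __restrict b, int n)
{
  const __m256i step = _mm256_setr_epi32 (0, 1, 2, 3, 4, 5, 6, 7);
  const __m256i limit = _mm256_set1_epi32 (n);
  for (int i = 0; i < n; i += 8)
    {
      /* Emulated WHILE_ULT: the vpbroadcastd + vpaddd + vpcmpud
         sequence above.  Lane j is active iff i + j < n.  */
      __m256i lanes = _mm256_add_epi32 (_mm256_set1_epi32 (i), step);
      __mmask8 k = _mm256_cmplt_epu32_mask (lanes, limit);

      /* Zero-masked loads, plain add, masked store (the {%k1}
         operands in the generated code).  */
      __m256i vb = _mm256_maskz_loadu_epi32 (k, b + i);
      __m256i va = _mm256_maskz_loadu_epi32 (k, a + i);
      _mm256_mask_storeu_epi32 (b + i, k, _mm256_add_epi32 (va, vb));
    }
}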