From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from smtp-out2.suse.de (smtp-out2.suse.de [195.135.220.29]) by sourceware.org (Postfix) with ESMTPS id 35D24386185D for ; Thu, 15 Jul 2021 10:30:22 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org 35D24386185D Authentication-Results: sourceware.org; dmarc=none (p=none dis=none) header.from=suse.de Authentication-Results: sourceware.org; spf=pass smtp.mailfrom=suse.de Received: from imap2.suse-dmz.suse.de (imap2.suse-dmz.suse.de [192.168.254.74]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature ECDSA (P-521) server-digest SHA512) (No client certificate requested) by smtp-out2.suse.de (Postfix) with ESMTPS id 2556E1FE0C; Thu, 15 Jul 2021 10:30:21 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=suse.de; s=susede2_rsa; t=1626345021; h=from:from:reply-to:date:date:message-id:message-id:to:to:cc:cc: mime-version:mime-version:content-type:content-type; bh=PGID5wjucP7KW4uVdHktTUsb7cSkb9pjS/85eaQdMug=; b=a1Ks3HTIFjzfhOS6beCZd9hVzgzUK5aPJ+//vpDvpKnrSJhZThkzow0o1sCVt4O8UiSaOk AC/GEmfn2bTeAQVBUSRRF+esanGTGvR0Jmfz6brF+8j86SfSiDbV7KeEnfw35MgDeTuSUW 437Lr17/R3UbSbvMLOCyjHRE/beCrcg= DKIM-Signature: v=1; a=ed25519-sha256; c=relaxed/relaxed; d=suse.de; s=susede2_ed25519; t=1626345021; h=from:from:reply-to:date:date:message-id:message-id:to:to:cc:cc: mime-version:mime-version:content-type:content-type; bh=PGID5wjucP7KW4uVdHktTUsb7cSkb9pjS/85eaQdMug=; b=ofZWRcTuCiLOxa7j8kaFf0qHCAoORNuypnjopHPY9eLE4lxWzmOo4KEiDK4fnbu+Bo7fZN Js69MKEhHRNr4ECQ== Received: from imap2.suse-dmz.suse.de (imap2.suse-dmz.suse.de [192.168.254.74]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature ECDSA (P-521) server-digest SHA512) (No client certificate requested) by imap2.suse-dmz.suse.de (Postfix) with ESMTPS id 0006213C2F; Thu, 15 Jul 2021 10:30:20 +0000 (UTC) Received: from dovecot-director2.suse.de 
([192.168.254.65]) by imap2.suse-dmz.suse.de with ESMTPSA id 2wMCOjwO8GAAGgAAMHmgww (envelope-from ); Thu, 15 Jul 2021 10:30:20 +0000 Date: Thu, 15 Jul 2021 12:30:20 +0200 (CEST) From: Richard Biener To: gcc-patches@gcc.gnu.org cc: richard.sandiford@arm.com, hongtao.liu@intel.com Subject: [PATCH 2/2][RFC] Add loop masking support for x86 Message-ID: <73rrp0p-859r-oq2n-pss7-6744807s3qr5@fhfr.qr> MIME-Version: 1.0 Content-Type: text/plain; charset=US-ASCII X-Spam-Status: No, score=-11.0 required=5.0 tests=BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, GIT_PATCH_0, KAM_NUMSUBJECT, SPF_HELO_NONE, SPF_PASS, TXREP autolearn=ham autolearn_force=no version=3.4.4 X-Spam-Checker-Version: SpamAssassin 3.4.4 (2020-01-24) on server2.sourceware.org X-BeenThere: gcc-patches@gcc.gnu.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Gcc-patches mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Thu, 15 Jul 2021 10:30:23 -0000 The following extends the existing loop masking support using SVE WHILE_ULT to x86 by providing an alternate way to produce the mask using VEC_COND_EXPRs. So with --param vect-partial-vector-usage you can now enable masked vectorized epilogues (=1) or fully masked vector loops (=2). What's missing is using a scalar IV for the loop control (but in principle AVX512 can use the mask here - just the patch doesn't seem to work for AVX512 yet for some reason - likely expand_vec_cond_expr_p doesn't work there). What's also missing is providing more support for predicated operations in the case of reductions either via VEC_COND_EXPRs or via implementing some of the .COND_{ADD,SUB,MUL...} internal functions as mapping to masked AVX512 operations. 
For AVX2 and int foo (unsigned *a, unsigned * __restrict b, int n) { unsigned sum = 1; for (int i = 0; i < n; ++i) b[i] += a[i]; return sum; } we get .L3: vpmaskmovd (%rsi,%rax), %ymm0, %ymm3 vpmaskmovd (%rdi,%rax), %ymm0, %ymm1 addl $8, %edx vpaddd %ymm3, %ymm1, %ymm1 vpmaskmovd %ymm1, %ymm0, (%rsi,%rax) vmovd %edx, %xmm1 vpsubd %ymm15, %ymm2, %ymm0 addq $32, %rax vpbroadcastd %xmm1, %ymm1 vpaddd %ymm4, %ymm1, %ymm1 vpsubd %ymm15, %ymm1, %ymm1 vpcmpgtd %ymm1, %ymm0, %ymm0 vptest %ymm0, %ymm0 jne .L3 for the fully masked loop body and for the masked epilogue we see .L4: vmovdqu (%rsi,%rax), %ymm3 vpaddd (%rdi,%rax), %ymm3, %ymm0 vmovdqu %ymm0, (%rsi,%rax) addq $32, %rax cmpq %rax, %rcx jne .L4 movl %edx, %eax andl $-8, %eax testb $7, %dl je .L11 .L3: subl %eax, %edx vmovdqa .LC0(%rip), %ymm1 salq $2, %rax vmovd %edx, %xmm0 movl $-2147483648, %edx addq %rax, %rsi vmovd %edx, %xmm15 vpbroadcastd %xmm0, %ymm0 vpbroadcastd %xmm15, %ymm15 vpsubd %ymm15, %ymm1, %ymm1 vpsubd %ymm15, %ymm0, %ymm0 vpcmpgtd %ymm1, %ymm0, %ymm0 vpmaskmovd (%rsi), %ymm0, %ymm1 vpmaskmovd (%rdi,%rax), %ymm0, %ymm2 vpaddd %ymm2, %ymm1, %ymm1 vpmaskmovd %ymm1, %ymm0, (%rsi) .L11: vzeroupper compared to .L3: movl %edx, %r8d subl %eax, %r8d leal -1(%r8), %r9d cmpl $2, %r9d jbe .L6 leaq (%rcx,%rax,4), %r9 vmovdqu (%rdi,%rax,4), %xmm2 movl %r8d, %eax andl $-4, %eax vpaddd (%r9), %xmm2, %xmm0 addl %eax, %esi andl $3, %r8d vmovdqu %xmm0, (%r9) je .L2 .L6: movslq %esi, %r8 leaq 0(,%r8,4), %rax movl (%rdi,%r8,4), %r8d addl %r8d, (%rcx,%rax) leal 1(%rsi), %r8d cmpl %r8d, %edx jle .L2 addl $2, %esi movl 4(%rdi,%rax), %r8d addl %r8d, 4(%rcx,%rax) cmpl %esi, %edx jle .L2 movl 8(%rdi,%rax), %edx addl %edx, 8(%rcx,%rax) .L2: I'm giving this a little testing right now but will dig on why I don't get masked loops when AVX512 is enabled. Still comments are appreciated. Thanks, Richard. 2021-07-15 Richard Biener * tree-vect-loop.c (can_produce_all_loop_masks_p): We also can produce masks with VEC_COND_EXPRs. 
* tree-vect-stmts.c (vect_gen_while): Generate the mask with a VEC_COND_EXPR in case WHILE_ULT is not supported. --- gcc/tree-vect-loop.c | 8 ++++++- gcc/tree-vect-stmts.c | 50 ++++++++++++++++++++++++++++++++++--------- 2 files changed, 47 insertions(+), 11 deletions(-) diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c index fc3dab0d143..2214ed11dfb 100644 --- a/gcc/tree-vect-loop.c +++ b/gcc/tree-vect-loop.c @@ -975,11 +975,17 @@ can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type) { rgroup_controls *rgm; unsigned int i; + tree cmp_vectype; FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm) if (rgm->type != NULL_TREE && !direct_internal_fn_supported_p (IFN_WHILE_ULT, cmp_type, rgm->type, - OPTIMIZE_FOR_SPEED)) + OPTIMIZE_FOR_SPEED) + && ((cmp_vectype + = truth_type_for (build_vector_type + (cmp_type, TYPE_VECTOR_SUBPARTS (rgm->type)))), + true) + && !expand_vec_cond_expr_p (rgm->type, cmp_vectype, LT_EXPR)) return false; return true; } diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c index 6a25d661800..216986399b1 100644 --- a/gcc/tree-vect-stmts.c +++ b/gcc/tree-vect-stmts.c @@ -12007,16 +12007,46 @@ vect_gen_while (gimple_seq *seq, tree mask_type, tree start_index, tree end_index, const char *name) { tree cmp_type = TREE_TYPE (start_index); - gcc_checking_assert (direct_internal_fn_supported_p (IFN_WHILE_ULT, - cmp_type, mask_type, - OPTIMIZE_FOR_SPEED)); - gcall *call = gimple_build_call_internal (IFN_WHILE_ULT, 3, - start_index, end_index, - build_zero_cst (mask_type)); - tree tmp = make_temp_ssa_name (mask_type, NULL, name); - gimple_call_set_lhs (call, tmp); - gimple_seq_add_stmt (seq, call); - return tmp; + if (direct_internal_fn_supported_p (IFN_WHILE_ULT, + cmp_type, mask_type, + OPTIMIZE_FOR_SPEED)) + { + gcall *call = gimple_build_call_internal (IFN_WHILE_ULT, 3, + start_index, end_index, + build_zero_cst (mask_type)); + tree tmp = make_temp_ssa_name (mask_type, NULL, name); + gimple_call_set_lhs (call, tmp); + 
gimple_seq_add_stmt (seq, call); + return tmp; + } + else + { + /* Generate + _1 = { start_index, start_index, ... }; + _2 = { end_index, end_index, ... }; + _3 = _1 + { 0, 1, 2 ... }; + _4 = _3 < _2; + _5 = VEC_COND_EXPR <_4, { -1, -1, ... } : { 0, 0, ... }>; */ + tree cvectype = build_vector_type (cmp_type, + TYPE_VECTOR_SUBPARTS (mask_type)); + tree si = make_ssa_name (cvectype); + gassign *ass = gimple_build_assign + (si, build_vector_from_val (cvectype, start_index)); + gimple_seq_add_stmt (seq, ass); + tree ei = make_ssa_name (cvectype); + ass = gimple_build_assign (ei, + build_vector_from_val (cvectype, end_index)); + gimple_seq_add_stmt (seq, ass); + tree incr = build_vec_series (cvectype, build_zero_cst (cmp_type), + build_one_cst (cmp_type)); + si = gimple_build (seq, PLUS_EXPR, cvectype, si, incr); + tree cmp = gimple_build (seq, LT_EXPR, truth_type_for (cvectype), + si, ei); + tree mask = gimple_build (seq, VEC_COND_EXPR, mask_type, cmp, + build_all_ones_cst (mask_type), + build_zero_cst (mask_type)); + return mask; + } } /* Generate a vector mask of type MASK_TYPE for which index I is false iff -- 2.26.2