From: Richard Biener <rguenther@suse.de>
To: gcc-patches@gcc.gnu.org
Cc: richard.sandiford@arm.com, hongtao.liu@intel.com
Subject: [PATCH 2/2][RFC] Add loop masking support for x86
Date: Thu, 15 Jul 2021 12:30:20 +0200 (CEST) [thread overview]
Message-ID: <73rrp0p-859r-oq2n-pss7-6744807s3qr5@fhfr.qr> (raw)
The following extends the existing loop masking support using
SVE WHILE_ULT to x86 by providing an alternate way to produce the
mask using VEC_COND_EXPRs. So with --param vect-partial-vector-usage
you can now enable masked vectorized epilogues (=1) or fully
masked vector loops (=2).
What's missing is using a scalar IV for the loop control
(but in principle AVX512 can use the mask here - just the patch
doesn't seem to work for AVX512 yet for some reason - likely
expand_vec_cond_expr_p doesn't work there). What's also missing
is providing more support for predicated operations in the case
of reductions either via VEC_COND_EXPRs or via implementing
some of the .COND_{ADD,SUB,MUL...} internal functions as mapping
to masked AVX512 operations.
For AVX2 and
int foo (unsigned *a, unsigned * __restrict b, int n)
{
unsigned sum = 1;
for (int i = 0; i < n; ++i)
b[i] += a[i];
return sum;
}
we get
.L3:
vpmaskmovd (%rsi,%rax), %ymm0, %ymm3
vpmaskmovd (%rdi,%rax), %ymm0, %ymm1
addl $8, %edx
vpaddd %ymm3, %ymm1, %ymm1
vpmaskmovd %ymm1, %ymm0, (%rsi,%rax)
vmovd %edx, %xmm1
vpsubd %ymm15, %ymm2, %ymm0
addq $32, %rax
vpbroadcastd %xmm1, %ymm1
vpaddd %ymm4, %ymm1, %ymm1
vpsubd %ymm15, %ymm1, %ymm1
vpcmpgtd %ymm1, %ymm0, %ymm0
vptest %ymm0, %ymm0
jne .L3
for the fully masked loop body and for the masked epilogue
we see
.L4:
vmovdqu (%rsi,%rax), %ymm3
vpaddd (%rdi,%rax), %ymm3, %ymm0
vmovdqu %ymm0, (%rsi,%rax)
addq $32, %rax
cmpq %rax, %rcx
jne .L4
movl %edx, %eax
andl $-8, %eax
testb $7, %dl
je .L11
.L3:
subl %eax, %edx
vmovdqa .LC0(%rip), %ymm1
salq $2, %rax
vmovd %edx, %xmm0
movl $-2147483648, %edx
addq %rax, %rsi
vmovd %edx, %xmm15
vpbroadcastd %xmm0, %ymm0
vpbroadcastd %xmm15, %ymm15
vpsubd %ymm15, %ymm1, %ymm1
vpsubd %ymm15, %ymm0, %ymm0
vpcmpgtd %ymm1, %ymm0, %ymm0
vpmaskmovd (%rsi), %ymm0, %ymm1
vpmaskmovd (%rdi,%rax), %ymm0, %ymm2
vpaddd %ymm2, %ymm1, %ymm1
vpmaskmovd %ymm1, %ymm0, (%rsi)
.L11:
vzeroupper
compared to
.L3:
movl %edx, %r8d
subl %eax, %r8d
leal -1(%r8), %r9d
cmpl $2, %r9d
jbe .L6
leaq (%rcx,%rax,4), %r9
vmovdqu (%rdi,%rax,4), %xmm2
movl %r8d, %eax
andl $-4, %eax
vpaddd (%r9), %xmm2, %xmm0
addl %eax, %esi
andl $3, %r8d
vmovdqu %xmm0, (%r9)
je .L2
.L6:
movslq %esi, %r8
leaq 0(,%r8,4), %rax
movl (%rdi,%r8,4), %r8d
addl %r8d, (%rcx,%rax)
leal 1(%rsi), %r8d
cmpl %r8d, %edx
jle .L2
addl $2, %esi
movl 4(%rdi,%rax), %r8d
addl %r8d, 4(%rcx,%rax)
cmpl %esi, %edx
jle .L2
movl 8(%rdi,%rax), %edx
addl %edx, 8(%rcx,%rax)
.L2:
I'm giving this a little testing right now but will dig on why
I don't get masked loops when AVX512 is enabled.
Still, comments are appreciated.
Thanks,
Richard.
2021-07-15 Richard Biener <rguenther@suse.de>
* tree-vect-stmts.c (can_produce_all_loop_masks_p): We
also can produce masks with VEC_COND_EXPRs.
* tree-vect-loop.c (vect_gen_while): Generate the mask
with a VEC_COND_EXPR in case WHILE_ULT is not supported.
---
gcc/tree-vect-loop.c | 8 ++++++-
gcc/tree-vect-stmts.c | 50 ++++++++++++++++++++++++++++++++++---------
2 files changed, 47 insertions(+), 11 deletions(-)
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index fc3dab0d143..2214ed11dfb 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -975,11 +975,17 @@ can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
{
rgroup_controls *rgm;
unsigned int i;
+ tree cmp_vectype;
FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
if (rgm->type != NULL_TREE
&& !direct_internal_fn_supported_p (IFN_WHILE_ULT,
cmp_type, rgm->type,
- OPTIMIZE_FOR_SPEED))
+ OPTIMIZE_FOR_SPEED)
+ && ((cmp_vectype
+ = truth_type_for (build_vector_type
+ (cmp_type, TYPE_VECTOR_SUBPARTS (rgm->type)))),
+ true)
+ && !expand_vec_cond_expr_p (rgm->type, cmp_vectype, LT_EXPR))
return false;
return true;
}
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 6a25d661800..216986399b1 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -12007,16 +12007,46 @@ vect_gen_while (gimple_seq *seq, tree mask_type, tree start_index,
tree end_index, const char *name)
{
tree cmp_type = TREE_TYPE (start_index);
- gcc_checking_assert (direct_internal_fn_supported_p (IFN_WHILE_ULT,
- cmp_type, mask_type,
- OPTIMIZE_FOR_SPEED));
- gcall *call = gimple_build_call_internal (IFN_WHILE_ULT, 3,
- start_index, end_index,
- build_zero_cst (mask_type));
- tree tmp = make_temp_ssa_name (mask_type, NULL, name);
- gimple_call_set_lhs (call, tmp);
- gimple_seq_add_stmt (seq, call);
- return tmp;
+ if (direct_internal_fn_supported_p (IFN_WHILE_ULT,
+ cmp_type, mask_type,
+ OPTIMIZE_FOR_SPEED))
+ {
+ gcall *call = gimple_build_call_internal (IFN_WHILE_ULT, 3,
+ start_index, end_index,
+ build_zero_cst (mask_type));
+ tree tmp = make_temp_ssa_name (mask_type, NULL, name);
+ gimple_call_set_lhs (call, tmp);
+ gimple_seq_add_stmt (seq, call);
+ return tmp;
+ }
+ else
+ {
+ /* Generate
+ _1 = { start_index, start_index, ... };
+ _2 = { end_index, end_index, ... };
+ _3 = _1 + { 0, 1, 2 ... };
+ _4 = _3 < _2;
+ _5 = VEC_COND_EXPR <_4, { -1, -1, ... } : { 0, 0, ... }>; */
+ tree cvectype = build_vector_type (cmp_type,
+ TYPE_VECTOR_SUBPARTS (mask_type));
+ tree si = make_ssa_name (cvectype);
+ gassign *ass = gimple_build_assign
+ (si, build_vector_from_val (cvectype, start_index));
+ gimple_seq_add_stmt (seq, ass);
+ tree ei = make_ssa_name (cvectype);
+ ass = gimple_build_assign (ei,
+ build_vector_from_val (cvectype, end_index));
+ gimple_seq_add_stmt (seq, ass);
+ tree incr = build_vec_series (cvectype, build_zero_cst (cmp_type),
+ build_one_cst (cmp_type));
+ si = gimple_build (seq, PLUS_EXPR, cvectype, si, incr);
+ tree cmp = gimple_build (seq, LT_EXPR, truth_type_for (cvectype),
+ si, ei);
+ tree mask = gimple_build (seq, VEC_COND_EXPR, mask_type, cmp,
+ build_all_ones_cst (mask_type),
+ build_zero_cst (mask_type));
+ return mask;
+ }
}
/* Generate a vector mask of type MASK_TYPE for which index I is false iff
--
2.26.2
next reply other threads:[~2021-07-15 10:30 UTC|newest]
Thread overview: 22+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-07-15 10:30 Richard Biener [this message]
2021-07-15 10:45 ` Richard Biener
2021-07-15 11:20 ` Hongtao Liu
2021-07-15 11:48 ` Richard Biener
2021-07-15 14:57 ` Richard Sandiford
2021-07-15 15:15 ` Richard Biener
2021-07-15 15:31 ` Richard Biener
2021-07-16 9:11 ` Richard Biener
2021-07-20 4:20 ` Hongtao Liu
2021-07-20 7:38 ` Richard Biener
2021-07-20 11:07 ` Hongtao Liu
2021-07-20 11:09 ` Richard Biener
2021-07-21 7:57 ` Hongtao Liu
2021-07-21 8:16 ` Richard Biener
2021-07-21 9:38 ` Hongtao Liu
2021-07-21 10:13 ` Richard Biener
2021-07-16 1:46 ` Hongtao Liu
2021-07-16 6:09 ` Richard Biener
2021-07-15 13:49 ` Richard Sandiford
2021-07-15 13:54 ` Richard Biener
2021-07-20 13:48 ` Richard Biener
2021-07-21 6:17 ` Richard Biener
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=73rrp0p-859r-oq2n-pss7-6744807s3qr5@fhfr.qr \
--to=rguenther@suse.de \
--cc=gcc-patches@gcc.gnu.org \
--cc=hongtao.liu@intel.com \
--cc=richard.sandiford@arm.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).