From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from smtp-out2.suse.de (smtp-out2.suse.de [195.135.220.29]) by sourceware.org (Postfix) with ESMTPS id 35D24386185D for ; Thu, 15 Jul 2021 10:30:22 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org 35D24386185D Authentication-Results: sourceware.org; dmarc=none (p=none dis=none) header.from=suse.de Authentication-Results: sourceware.org; spf=pass smtp.mailfrom=suse.de Received: from imap2.suse-dmz.suse.de (imap2.suse-dmz.suse.de [192.168.254.74]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature ECDSA (P-521) server-digest SHA512) (No client certificate requested) by smtp-out2.suse.de (Postfix) with ESMTPS id 2556E1FE0C; Thu, 15 Jul 2021 10:30:21 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=suse.de; s=susede2_rsa; t=1626345021; h=from:from:reply-to:date:date:message-id:message-id:to:to:cc:cc: mime-version:mime-version:content-type:content-type; bh=PGID5wjucP7KW4uVdHktTUsb7cSkb9pjS/85eaQdMug=; b=a1Ks3HTIFjzfhOS6beCZd9hVzgzUK5aPJ+//vpDvpKnrSJhZThkzow0o1sCVt4O8UiSaOk AC/GEmfn2bTeAQVBUSRRF+esanGTGvR0Jmfz6brF+8j86SfSiDbV7KeEnfw35MgDeTuSUW 437Lr17/R3UbSbvMLOCyjHRE/beCrcg= DKIM-Signature: v=1; a=ed25519-sha256; c=relaxed/relaxed; d=suse.de; s=susede2_ed25519; t=1626345021; h=from:from:reply-to:date:date:message-id:message-id:to:to:cc:cc: mime-version:mime-version:content-type:content-type; bh=PGID5wjucP7KW4uVdHktTUsb7cSkb9pjS/85eaQdMug=; b=ofZWRcTuCiLOxa7j8kaFf0qHCAoORNuypnjopHPY9eLE4lxWzmOo4KEiDK4fnbu+Bo7fZN Js69MKEhHRNr4ECQ== Received: from imap2.suse-dmz.suse.de (imap2.suse-dmz.suse.de [192.168.254.74]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature ECDSA (P-521) server-digest SHA512) (No client certificate requested) by imap2.suse-dmz.suse.de (Postfix) with ESMTPS id 0006213C2F; Thu, 15 Jul 2021 10:30:20 +0000 (UTC) Received: from dovecot-director2.suse.de 
([192.168.254.65]) by imap2.suse-dmz.suse.de with ESMTPSA id 2wMCOjwO8GAAGgAAMHmgww (envelope-from ); Thu, 15 Jul 2021 10:30:20 +0000 Date: Thu, 15 Jul 2021 12:30:20 +0200 (CEST) From: Richard Biener To: gcc-patches@gcc.gnu.org cc: richard.sandiford@arm.com, hongtao.liu@intel.com Subject: [PATCH 2/2][RFC] Add loop masking support for x86 Message-ID: <73rrp0p-859r-oq2n-pss7-6744807s3qr5@fhfr.qr> MIME-Version: 1.0 Content-Type: text/plain; charset=US-ASCII X-Spam-Status: No, score=-11.0 required=5.0 tests=BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, GIT_PATCH_0, KAM_NUMSUBJECT, SPF_HELO_NONE, SPF_PASS, TXREP autolearn=ham autolearn_force=no version=3.4.4 X-Spam-Checker-Version: SpamAssassin 3.4.4 (2020-01-24) on server2.sourceware.org X-BeenThere: gcc-patches@gcc.gnu.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Gcc-patches mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Thu, 15 Jul 2021 10:30:23 -0000 The following extends the existing loop masking support using SVE WHILE_ULT to x86 by providing an alternate way to produce the mask using VEC_COND_EXPRs. So with --param vect-partial-vector-usage you can now enable masked vectorized epilogues (=1) or fully masked vector loops (=2). What's missing is using a scalar IV for the loop control (but in principle AVX512 can use the mask here - just the patch doesn't seem to work for AVX512 yet for some reason - likely expand_vec_cond_expr_p doesn't work there). What's also missing is providing more support for predicated operations in the case of reductions either via VEC_COND_EXPRs or via implementing some of the .COND_{ADD,SUB,MUL...} internal functions as mapping to masked AVX512 operations. 
For AVX2 and int foo (unsigned *a, unsigned * __restrict b, int n) { unsigned sum = 1; for (int i = 0; i < n; ++i) b[i] += a[i]; return sum; } we get .L3: vpmaskmovd (%rsi,%rax), %ymm0, %ymm3 vpmaskmovd (%rdi,%rax), %ymm0, %ymm1 addl $8, %edx vpaddd %ymm3, %ymm1, %ymm1 vpmaskmovd %ymm1, %ymm0, (%rsi,%rax) vmovd %edx, %xmm1 vpsubd %ymm15, %ymm2, %ymm0 addq $32, %rax vpbroadcastd %xmm1, %ymm1 vpaddd %ymm4, %ymm1, %ymm1 vpsubd %ymm15, %ymm1, %ymm1 vpcmpgtd %ymm1, %ymm0, %ymm0 vptest %ymm0, %ymm0 jne .L3 for the fully masked loop body and for the masked epilogue we see .L4: vmovdqu (%rsi,%rax), %ymm3 vpaddd (%rdi,%rax), %ymm3, %ymm0 vmovdqu %ymm0, (%rsi,%rax) addq $32, %rax cmpq %rax, %rcx jne .L4 movl %edx, %eax andl $-8, %eax testb $7, %dl je .L11 .L3: subl %eax, %edx vmovdqa .LC0(%rip), %ymm1 salq $2, %rax vmovd %edx, %xmm0 movl $-2147483648, %edx addq %rax, %rsi vmovd %edx, %xmm15 vpbroadcastd %xmm0, %ymm0 vpbroadcastd %xmm15, %ymm15 vpsubd %ymm15, %ymm1, %ymm1 vpsubd %ymm15, %ymm0, %ymm0 vpcmpgtd %ymm1, %ymm0, %ymm0 vpmaskmovd (%rsi), %ymm0, %ymm1 vpmaskmovd (%rdi,%rax), %ymm0, %ymm2 vpaddd %ymm2, %ymm1, %ymm1 vpmaskmovd %ymm1, %ymm0, (%rsi) .L11: vzeroupper compared to .L3: movl %edx, %r8d subl %eax, %r8d leal -1(%r8), %r9d cmpl $2, %r9d jbe .L6 leaq (%rcx,%rax,4), %r9 vmovdqu (%rdi,%rax,4), %xmm2 movl %r8d, %eax andl $-4, %eax vpaddd (%r9), %xmm2, %xmm0 addl %eax, %esi andl $3, %r8d vmovdqu %xmm0, (%r9) je .L2 .L6: movslq %esi, %r8 leaq 0(,%r8,4), %rax movl (%rdi,%r8,4), %r8d addl %r8d, (%rcx,%rax) leal 1(%rsi), %r8d cmpl %r8d, %edx jle .L2 addl $2, %esi movl 4(%rdi,%rax), %r8d addl %r8d, 4(%rcx,%rax) cmpl %esi, %edx jle .L2 movl 8(%rdi,%rax), %edx addl %edx, 8(%rcx,%rax) .L2: I'm giving this a little testing right now but will dig on why I don't get masked loops when AVX512 is enabled. Still comments are appreciated. Thanks, Richard. 2021-07-15 Richard Biener * tree-vect-loop.c (can_produce_all_loop_masks_p): We also can produce masks with VEC_COND_EXPRs. 
* tree-vect-stmts.c (vect_gen_while): Generate the mask with a VEC_COND_EXPR in case WHILE_ULT is not supported. --- gcc/tree-vect-loop.c | 8 ++++++- gcc/tree-vect-stmts.c | 50 ++++++++++++++++++++++++++++++++++--------- 2 files changed, 47 insertions(+), 11 deletions(-) diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c index fc3dab0d143..2214ed11dfb 100644 --- a/gcc/tree-vect-loop.c +++ b/gcc/tree-vect-loop.c @@ -975,11 +975,17 @@ can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type) { rgroup_controls *rgm; unsigned int i; + tree cmp_vectype; FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm) if (rgm->type != NULL_TREE && !direct_internal_fn_supported_p (IFN_WHILE_ULT, cmp_type, rgm->type, - OPTIMIZE_FOR_SPEED)) + OPTIMIZE_FOR_SPEED) + && ((cmp_vectype + = truth_type_for (build_vector_type + (cmp_type, TYPE_VECTOR_SUBPARTS (rgm->type)))), + true) + && !expand_vec_cond_expr_p (rgm->type, cmp_vectype, LT_EXPR)) return false; return true; } diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c index 6a25d661800..216986399b1 100644 --- a/gcc/tree-vect-stmts.c +++ b/gcc/tree-vect-stmts.c @@ -12007,16 +12007,46 @@ vect_gen_while (gimple_seq *seq, tree mask_type, tree start_index, tree end_index, const char *name) { tree cmp_type = TREE_TYPE (start_index); - gcc_checking_assert (direct_internal_fn_supported_p (IFN_WHILE_ULT, - cmp_type, mask_type, - OPTIMIZE_FOR_SPEED)); - gcall *call = gimple_build_call_internal (IFN_WHILE_ULT, 3, - start_index, end_index, - build_zero_cst (mask_type)); - tree tmp = make_temp_ssa_name (mask_type, NULL, name); - gimple_call_set_lhs (call, tmp); - gimple_seq_add_stmt (seq, call); - return tmp; + if (direct_internal_fn_supported_p (IFN_WHILE_ULT, + cmp_type, mask_type, + OPTIMIZE_FOR_SPEED)) + { + gcall *call = gimple_build_call_internal (IFN_WHILE_ULT, 3, + start_index, end_index, + build_zero_cst (mask_type)); + tree tmp = make_temp_ssa_name (mask_type, NULL, name); + gimple_call_set_lhs (call, tmp); + 
gimple_seq_add_stmt (seq, call); + return tmp; + } + else + { + /* Generate + _1 = { start_index, start_index, ... }; + _2 = { end_index, end_index, ... }; + _3 = _1 + { 0, 1, 2 ... }; + _4 = _3 < _2; + _5 = VEC_COND_EXPR <_4, { -1, -1, ... } : { 0, 0, ... }>; */ + tree cvectype = build_vector_type (cmp_type, + TYPE_VECTOR_SUBPARTS (mask_type)); + tree si = make_ssa_name (cvectype); + gassign *ass = gimple_build_assign + (si, build_vector_from_val (cvectype, start_index)); + gimple_seq_add_stmt (seq, ass); + tree ei = make_ssa_name (cvectype); + ass = gimple_build_assign (ei, + build_vector_from_val (cvectype, end_index)); + gimple_seq_add_stmt (seq, ass); + tree incr = build_vec_series (cvectype, build_zero_cst (cmp_type), + build_one_cst (cmp_type)); + si = gimple_build (seq, PLUS_EXPR, cvectype, si, incr); + tree cmp = gimple_build (seq, LT_EXPR, truth_type_for (cvectype), + si, ei); + tree mask = gimple_build (seq, VEC_COND_EXPR, mask_type, cmp, + build_all_ones_cst (mask_type), + build_zero_cst (mask_type)); + return mask; + } } /* Generate a vector mask of type MASK_TYPE for which index I is false iff -- 2.26.2