From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from foss.arm.com (foss.arm.com [217.140.110.172]) by sourceware.org (Postfix) with ESMTP id EBFB03858C53 for ; Thu, 24 Aug 2023 09:38:23 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.2 sourceware.org EBFB03858C53 Authentication-Results: sourceware.org; dmarc=pass (p=none dis=none) header.from=arm.com Authentication-Results: sourceware.org; spf=pass smtp.mailfrom=arm.com Received: from usa-sjc-imap-foss1.foss.arm.com (unknown [10.121.207.14]) by usa-sjc-mx-foss1.foss.arm.com (Postfix) with ESMTP id CC7FE1042; Thu, 24 Aug 2023 02:39:03 -0700 (PDT) Received: from localhost (e121540-lin.manchester.arm.com [10.32.110.72]) by usa-sjc-imap-foss1.foss.arm.com (Postfix) with ESMTPSA id 222573F762; Thu, 24 Aug 2023 02:38:23 -0700 (PDT) From: Richard Sandiford To: Richard Biener Mail-Followup-To: Richard Biener ,gcc-patches@gcc.gnu.org, richard.sandiford@arm.com Cc: gcc-patches@gcc.gnu.org Subject: Re: [PATCH] tree-optimization/111115 - SLP of masked stores References: Date: Thu, 24 Aug 2023 10:38:21 +0100 In-Reply-To: (Richard Biener's message of "Wed, 23 Aug 2023 13:24:17 +0000 (UTC)") Message-ID: User-Agent: Gnus/5.13 (Gnus v5.13) Emacs/26.3 (gnu/linux) MIME-Version: 1.0 Content-Type: text/plain X-Spam-Status: No, score=-25.5 required=5.0 tests=BAYES_00,GIT_PATCH_0,KAM_DMARC_NONE,KAM_DMARC_STATUS,KAM_LAZY_DOMAIN_SECURITY,SPF_HELO_NONE,SPF_NONE,TXREP autolearn=ham autolearn_force=no version=3.4.6 X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on server2.sourceware.org List-Id: Richard Biener writes: > The following adds the capability to do SLP on .MASK_STORE, I do not > plan to add interleaving support. > > Bootstrapped and tested on x86_64-unknown-linux-gnu, OK? LGTM, thanks. Richard > Thanks, > Richard. > > PR tree-optimization/111115 > gcc/ > * tree-vectorizer.h (vect_slp_child_index_for_operand): New. > * tree-vect-data-refs.cc (can_group_stmts_p): Also group > .MASK_STORE. 
> * tree-vect-slp.cc (arg3_arg2_map): New. > (vect_get_operand_map): Handle IFN_MASK_STORE. > (vect_slp_child_index_for_operand): New function. > (vect_build_slp_tree_1): Handle statements with no LHS, > masked store ifns. > (vect_remove_slp_scalar_calls): Likewise. > * tree-vect-stmts.cc (vect_check_store_rhs): Lookup the > SLP child corresponding to the ifn value index. > (vectorizable_store): Likewise for the mask index. Support > masked stores. > (vectorizable_load): Lookup the SLP child corresponding to the > ifn mask index. > > gcc/testsuite/ > * lib/target-supports.exp (check_effective_target_vect_masked_store): > Supported with check_avx_available. > * gcc.dg/vect/slp-mask-store-1.c: New testcase. > --- > gcc/testsuite/gcc.dg/vect/slp-mask-store-1.c | 39 +++++++++++++++++ > gcc/testsuite/lib/target-supports.exp | 3 +- > gcc/tree-vect-data-refs.cc | 3 +- > gcc/tree-vect-slp.cc | 46 +++++++++++++++++--- > gcc/tree-vect-stmts.cc | 23 +++++----- > gcc/tree-vectorizer.h | 1 + > 6 files changed, 94 insertions(+), 21 deletions(-) > create mode 100644 gcc/testsuite/gcc.dg/vect/slp-mask-store-1.c > > diff --git a/gcc/testsuite/gcc.dg/vect/slp-mask-store-1.c b/gcc/testsuite/gcc.dg/vect/slp-mask-store-1.c > new file mode 100644 > index 00000000000..50b7066778e > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/vect/slp-mask-store-1.c > @@ -0,0 +1,39 @@ > +/* { dg-do run } */ > +/* { dg-additional-options "-mavx2" { target avx2 } } */ > + > +#include "tree-vect.h" > + > +void __attribute__((noipa)) > +foo (unsigned * __restrict x, int * __restrict flag) > +{ > + for (int i = 0; i < 32; ++i) > + { > + if (flag[2*i+0]) > + x[2*i+0] = x[2*i+0] + 3; > + if (flag[2*i+1]) > + x[2*i+1] = x[2*i+1] + 177; > + } > +} > + > +unsigned x[16]; > +int flag[32] = { 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, > + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; > +unsigned res[16] = { 3, 177, 0, 0, 0, 177, 3, 0, 3, 177, 0, 0, 0, 177, 3, 0 }; > + > +int > +main () > +{ > + check_vect (); 
> + > + foo (x, flag); > + > + if (__builtin_memcmp (x, res, sizeof (x)) != 0) > + abort (); > + for (int i = 0; i < 32; ++i) > + if (flag[i] != 0 && flag[i] != 1) > + abort (); > + > + return 0; > +} > + > +/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 1 "vect" { target { vect_masked_store && vect_masked_load } } } } */ > diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp > index d4623ee6b45..d353cc0aaf0 100644 > --- a/gcc/testsuite/lib/target-supports.exp > +++ b/gcc/testsuite/lib/target-supports.exp > @@ -8400,7 +8400,8 @@ proc check_effective_target_vect_masked_load { } { > # Return 1 if the target supports vector masked stores. > > proc check_effective_target_vect_masked_store { } { > - return [expr { [check_effective_target_aarch64_sve] > + return [expr { [check_avx_available] > + || [check_effective_target_aarch64_sve] > || [istarget amdgcn*-*-*] }] > } > > diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc > index 3e9a284666c..a2caf6cb1c7 100644 > --- a/gcc/tree-vect-data-refs.cc > +++ b/gcc/tree-vect-data-refs.cc > @@ -3048,8 +3048,7 @@ can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info, > like those created by build_mask_conversion. 
*/ > tree mask1 = gimple_call_arg (call1, 2); > tree mask2 = gimple_call_arg (call2, 2); > - if (!operand_equal_p (mask1, mask2, 0) > - && (ifn == IFN_MASK_STORE || !allow_slp_p)) > + if (!operand_equal_p (mask1, mask2, 0) && !allow_slp_p) > { > mask1 = strip_conversion (mask1); > if (!mask1) > diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc > index b5f9333fc22..cc799b6ebcd 100644 > --- a/gcc/tree-vect-slp.cc > +++ b/gcc/tree-vect-slp.cc > @@ -503,6 +503,7 @@ static const int cond_expr_maps[3][5] = { > static const int arg1_map[] = { 1, 1 }; > static const int arg2_map[] = { 1, 2 }; > static const int arg1_arg4_map[] = { 2, 1, 4 }; > +static const int arg3_arg2_map[] = { 2, 3, 2 }; > static const int op1_op0_map[] = { 2, 1, 0 }; > > /* For most SLP statements, there is a one-to-one mapping between > @@ -543,6 +544,9 @@ vect_get_operand_map (const gimple *stmt, unsigned char swap = 0) > case IFN_MASK_GATHER_LOAD: > return arg1_arg4_map; > > + case IFN_MASK_STORE: > + return arg3_arg2_map; > + > default: > break; > } > @@ -550,6 +554,20 @@ vect_get_operand_map (const gimple *stmt, unsigned char swap = 0) > return nullptr; > } > > +/* Return the SLP node child index for operand OP of STMT. */ > + > +int > +vect_slp_child_index_for_operand (const gimple *stmt, int op) > +{ > + const int *opmap = vect_get_operand_map (stmt); > + if (!opmap) > + return op; > + for (int i = 1; i < 1 + opmap[0]; ++i) > + if (opmap[i] == op) > + return i - 1; > + gcc_unreachable (); > +} > + > /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that > they are of a valid type and that they match the defs of the first stmt of > the SLP group (stored in OPRNDS_INFO). 
This function tries to match stmts > @@ -1003,8 +1021,12 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, > return false; > } > > + gcall *call_stmt = dyn_cast (stmt); > lhs = gimple_get_lhs (stmt); > - if (lhs == NULL_TREE) > + if (lhs == NULL_TREE > + && (!call_stmt > + || !gimple_call_internal_p (stmt) > + || !internal_store_fn_p (gimple_call_internal_fn (stmt)))) > { > if (dump_enabled_p ()) > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > @@ -1041,7 +1063,6 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, > > gcc_assert (vectype); > > - gcall *call_stmt = dyn_cast (stmt); > if (call_stmt) > { > combined_fn cfn = gimple_call_combined_fn (call_stmt); > @@ -1054,6 +1075,8 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, > || cfn == CFN_GATHER_LOAD > || cfn == CFN_MASK_GATHER_LOAD) > load_p = true; > + else if (cfn == CFN_MASK_STORE) > + rhs_code = CFN_MASK_STORE; > else if ((internal_fn_p (cfn) > && !vectorizable_internal_fn_p (as_internal_fn (cfn))) > || gimple_call_tail_p (call_stmt) > @@ -1212,7 +1235,9 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, > continue; > } > > - if (call_stmt && first_stmt_code != CFN_MASK_LOAD) > + if (call_stmt > + && first_stmt_code != CFN_MASK_LOAD > + && first_stmt_code != CFN_MASK_STORE) > { > if (!compatible_calls_p (as_a (stmts[0]->stmt), > call_stmt)) > @@ -1266,9 +1291,11 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, > /* Grouped store or load. */ > if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) > { > - if (REFERENCE_CLASS_P (lhs)) > + if (!load_p) > { > /* Store. 
*/ > + gcc_assert (rhs_code == CFN_MASK_STORE > + || REFERENCE_CLASS_P (lhs)); > ; > } > else > @@ -9090,10 +9117,17 @@ vect_remove_slp_scalar_calls (vec_info *vinfo, > || !PURE_SLP_STMT (stmt_info)) > continue; > lhs = gimple_call_lhs (stmt); > - new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs))); > + if (lhs) > + new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs))); > + else > + { > + new_stmt = gimple_build_nop (); > + unlink_stmt_vdef (stmt_info->stmt); > + } > gsi = gsi_for_stmt (stmt); > vinfo->replace_stmt (&gsi, stmt_info, new_stmt); > - SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt; > + if (lhs) > + SSA_NAME_DEF_STMT (lhs) = new_stmt; > } > } > > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc > index 413a88750d6..31b73b08e62 100644 > --- a/gcc/tree-vect-stmts.cc > +++ b/gcc/tree-vect-stmts.cc > @@ -2629,12 +2629,14 @@ vect_check_store_rhs (vec_info *vinfo, stmt_vec_info stmt_info, > return false; > } > > - unsigned op_no = 0; > + int op_no = 0; > if (gcall *call = dyn_cast (stmt_info->stmt)) > { > if (gimple_call_internal_p (call) > && internal_store_fn_p (gimple_call_internal_fn (call))) > op_no = internal_fn_stored_value_index (gimple_call_internal_fn (call)); > + if (slp_node) > + op_no = vect_slp_child_index_for_operand (call, op_no); > } > > enum vect_def_type rhs_dt; > @@ -8244,15 +8246,9 @@ vectorizable_store (vec_info *vinfo, > if (!internal_store_fn_p (ifn)) > return false; > > - if (slp_node != NULL) > - { > - if (dump_enabled_p ()) > - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > - "SLP of masked stores not supported.\n"); > - return false; > - } > - > int mask_index = internal_fn_mask_index (ifn); > + if (mask_index >= 0 && slp_node) > + mask_index = vect_slp_child_index_for_operand (call, mask_index); > if (mask_index >= 0 > && !vect_check_scalar_mask (vinfo, stmt_info, slp_node, mask_index, > &mask, NULL, &mask_dt, &mask_vectype)) > @@ -9093,8 +9089,10 @@ 
vectorizable_store (vec_info *vinfo, > { > /* Get vectorized arguments for SLP_NODE. */ > vect_get_vec_defs (vinfo, stmt_info, slp_node, 1, op, > - &vec_oprnds); > + &vec_oprnds, mask, &vec_masks); > vec_oprnd = vec_oprnds[0]; > + if (mask) > + vec_mask = vec_masks[0]; > } > else > { > @@ -9191,6 +9189,8 @@ vectorizable_store (vec_info *vinfo, > final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks, > vec_num * ncopies, vectype, > vec_num * j + i); > + if (slp && vec_mask) > + vec_mask = vec_masks[i]; > if (vec_mask) > final_mask = prepare_vec_mask (loop_vinfo, mask_vectype, final_mask, > vec_mask, gsi); > @@ -9575,9 +9575,8 @@ vectorizable_load (vec_info *vinfo, > return false; > > mask_index = internal_fn_mask_index (ifn); > - /* ??? For SLP the mask operand is always last. */ > if (mask_index >= 0 && slp_node) > - mask_index = SLP_TREE_CHILDREN (slp_node).length () - 1; > + mask_index = vect_slp_child_index_for_operand (call, mask_index); > if (mask_index >= 0 > && !vect_check_scalar_mask (vinfo, stmt_info, slp_node, mask_index, > &mask, NULL, &mask_dt, &mask_vectype)) > diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h > index 53a3d78d545..f1d0cd79961 100644 > --- a/gcc/tree-vectorizer.h > +++ b/gcc/tree-vectorizer.h > @@ -2429,6 +2429,7 @@ extern int vect_get_place_in_interleaving_chain (stmt_vec_info, stmt_vec_info); > extern slp_tree vect_create_new_slp_node (unsigned, tree_code); > extern void vect_free_slp_tree (slp_tree); > extern bool compatible_calls_p (gcall *, gcall *); > +extern int vect_slp_child_index_for_operand (const gimple *, int op); > > /* In tree-vect-patterns.cc. */ > extern void