From: juzhe.zhong@rivai.ai
To: gcc-patches@gcc.gnu.org
Cc: richard.sandiford@arm.com, rguenther@suse.de, Ju-Zhe Zhong
Subject: [PATCH] VECT: Support CALL vectorization for COND_LEN_*
Date: Mon, 24 Jul 2023 19:46:50 +0800
Message-Id: <20230724114650.61923-1-juzhe.zhong@rivai.ai>

From: Ju-Zhe Zhong

Hi, Richard and Richi.

This patch supports CALL vectorization by using COND_LEN_*.

Consider the following case:

void foo (float * __restrict a, float * __restrict b,
          int * __restrict cond, int n)
{
  for (int i = 0; i < n; i++)
    if (cond[i])
      a[i] = b[i] + a[i];
}

Output of RISC-V (32-bit) GCC (trunk):

<source>:5:21: missed: couldn't vectorize loop
<source>:5:21: missed: not vectorized: control flow in loop.

ARM SVE:

  ...
  mask__27.10_51 = vect__4.9_49 != { 0, ... };
  ...
  vec_mask_and_55 = loop_mask_49 & mask__27.10_51;
  ...
  vect__9.17_62 = .COND_ADD (vec_mask_and_55, vect__6.13_56, vect__8.16_60, vect__6.13_56);

For RVV, we want IR as follows:

  ...
  _68 = .SELECT_VL (ivtmp_66, POLY_INT_CST [4, 4]);
  ...
  mask__27.10_51 = vect__4.9_49 != { 0, ... };
  ...
  vect__9.17_60 = .COND_LEN_ADD (mask__27.10_51, vect__6.13_55, vect__8.16_59, vect__6.13_55, _68, 0);
  ...

Both the len and the mask of COND_LEN_ADD are real, not dummy.  A small
scalar model of these semantics follows after the ChangeLog.

gcc/ChangeLog:

        * tree-if-conv.cc (ifcvt_can_predicate): Enable ifcvt for COND_LEN_*.
        * tree-vect-stmts.cc (vectorizable_internal_function): Apply COND_LEN_*.
        (vectorizable_call): Ditto.
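For reference, here is a tiny scalar model of how I read the COND_LEN_ADD
semantics used above (the function and variable names in this sketch are
illustrative only and are not part of the patch): a lane produces the sum
only when its index is below LEN + BIAS and its mask bit is set; every
other lane takes the else value.

#include <stdio.h>

/* Scalar sketch of .COND_LEN_ADD (MASK, A, B, ELSE, LEN, BIAS):
   active, unmasked lanes get A + B, all other lanes get ELSE.  */
static void
cond_len_add_model (float *res, const int *mask, const float *a,
                    const float *b, const float *els, int len, int bias,
                    int nunits)
{
  for (int i = 0; i < nunits; i++)
    res[i] = (i < len + bias && mask[i]) ? a[i] + b[i] : els[i];
}

int
main (void)
{
  int mask[4] = { 1, 0, 1, 1 };
  float a[4] = { 1, 2, 3, 4 }, b[4] = { 10, 20, 30, 40 };
  float res[4];
  /* LEN = 3, BIAS = 0: lane 1 is masked out and lane 3 is beyond the
     active length, so both keep the else value (here A itself, matching
     the if-converted a[i] = cond[i] ? b[i] + a[i] : a[i]).  */
  cond_len_add_model (res, mask, a, b, a, 3, 0, 4);
  for (int i = 0; i < 4; i++)
    printf ("%g ", res[i]);  /* prints: 11 2 33 4  */
  printf ("\n");
  return 0;
}

With the else value tied to the first addend, this is the per-iteration
shape the .COND_LEN_ADD call in the RVV IR above is expected to compute.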
---
 gcc/tree-if-conv.cc    |  7 +++-
 gcc/tree-vect-stmts.cc | 84 +++++++++++++++++++++++++++++++++++++++---
 2 files changed, 83 insertions(+), 8 deletions(-)

diff --git a/gcc/tree-if-conv.cc b/gcc/tree-if-conv.cc
index 799f071965e..2e17658b9ed 100644
--- a/gcc/tree-if-conv.cc
+++ b/gcc/tree-if-conv.cc
@@ -1010,8 +1010,11 @@ ifcvt_can_predicate (gimple *stmt)
   if (!types_compatible_p (lhs_type, rhs_type))
     return false;
   internal_fn cond_fn = get_conditional_internal_fn (code);
-  return (cond_fn != IFN_LAST
-          && vectorized_internal_fn_supported_p (cond_fn, lhs_type));
+  internal_fn cond_len_fn = get_conditional_len_internal_fn (code);
+  return ((cond_fn != IFN_LAST
+           && vectorized_internal_fn_supported_p (cond_fn, lhs_type))
+          || (cond_len_fn != IFN_LAST
+              && vectorized_internal_fn_supported_p (cond_len_fn, lhs_type)));
 }
 
 /* Return true when STMT is if-convertible.
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index ed28fbdced3..15401ddb682 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -1543,10 +1543,27 @@ vectorizable_internal_function (combined_fn cfn, tree fndecl,
                                 tree vectype_out, tree vectype_in)
 {
   internal_fn ifn;
+  internal_fn len_ifn = IFN_LAST;
   if (internal_fn_p (cfn))
-    ifn = as_internal_fn (cfn);
+    {
+      ifn = as_internal_fn (cfn);
+      tree_code code = conditional_internal_fn_code (ifn);
+      len_ifn = get_conditional_len_internal_fn (code);
+    }
   else
     ifn = associated_internal_fn (fndecl);
+  if (len_ifn != IFN_LAST && direct_internal_fn_p (len_ifn))
+    {
+      const direct_internal_fn_info &info = direct_internal_fn (len_ifn);
+      if (info.vectorizable)
+        {
+          tree type0 = (info.type0 < 0 ? vectype_out : vectype_in);
+          tree type1 = (info.type1 < 0 ? vectype_out : vectype_in);
+          if (direct_internal_fn_supported_p (len_ifn, tree_pair (type0, type1),
+                                              OPTIMIZE_FOR_SPEED))
+            return len_ifn;
+        }
+    }
   if (ifn != IFN_LAST && direct_internal_fn_p (ifn))
     {
       const direct_internal_fn_info &info = direct_internal_fn (ifn);
@@ -3538,8 +3555,10 @@ vectorizable_call (vec_info *vinfo,
   gcc_assert (ncopies >= 1);
 
   int reduc_idx = STMT_VINFO_REDUC_IDX (stmt_info);
+  int len_index = internal_fn_len_index (ifn);
   internal_fn cond_fn = get_conditional_internal_fn (ifn);
   vec_loop_masks *masks = (loop_vinfo ? &LOOP_VINFO_MASKS (loop_vinfo) : NULL);
+  vec_loop_lens *lens = (loop_vinfo ? &LOOP_VINFO_LENS (loop_vinfo) : NULL);
   if (!vec_stmt) /* transformation not required.  */
     {
       if (slp_node)
@@ -3585,8 +3604,12 @@ vectorizable_call (vec_info *vinfo,
           tree scalar_mask = NULL_TREE;
           if (mask_opno >= 0)
             scalar_mask = gimple_call_arg (stmt_info->stmt, mask_opno);
-          vect_record_loop_mask (loop_vinfo, masks, nvectors,
-                                 vectype_out, scalar_mask);
+          if (len_index >= 0)
+            vect_record_loop_len (loop_vinfo, lens, nvectors, vectype_out,
+                                  1);
+          else
+            vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype_out,
+                                   scalar_mask);
         }
     }
   return true;
@@ -3602,8 +3625,16 @@ vectorizable_call (vec_info *vinfo,
   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
 
   bool masked_loop_p = loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
+  bool len_loop_p = loop_vinfo && LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo);
   unsigned int vect_nargs = nargs;
-  if (masked_loop_p && reduc_idx >= 0)
+  if (len_index)
+    {
+      /* COND_ADD --> COND_LEN_ADD with 2 more args (LEN + BIAS).
+         FIXME: We don't support FMAX --> COND_LEN_FMAX yet which
+         needs 4 more args (MASK + ELSE + LEN + BIAS).  */
+      vect_nargs += 2;
+    }
+  else if (masked_loop_p && reduc_idx >= 0)
     {
       ifn = cond_fn;
       vect_nargs += 2;
@@ -3670,7 +3701,29 @@ vectorizable_call (vec_info *vinfo,
                 }
               else
                 {
-                  if (mask_opno >= 0 && masked_loop_p)
+                  if (len_index)
+                    {
+                      tree len, bias;
+                      if (len_loop_p)
+                        len
+                          = vect_get_loop_len (loop_vinfo, gsi, lens,
+                                               ncopies, vectype_out, j, 1);
+                      else
+                        {
+                          /* Dummy LEN.  */
+                          tree iv_type
+                            = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
+                          len
+                            = build_int_cst (iv_type, TYPE_VECTOR_SUBPARTS (
+                                                        vectype_out));
+                        }
+                      signed char biasval
+                        = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+                      bias = build_int_cst (intQI_type_node, biasval);
+                      vargs[varg++] = len;
+                      vargs[varg++] = bias;
+                    }
+                  else if (mask_opno >= 0 && masked_loop_p)
                     {
                       unsigned int vec_num = vec_oprnds0.length ();
                       /* Always true for SLP.  */
@@ -3718,7 +3771,26 @@ vectorizable_call (vec_info *vinfo,
           if (masked_loop_p && reduc_idx >= 0)
             vargs[varg++] = vargs[reduc_idx + 1];
 
-          if (mask_opno >= 0 && masked_loop_p)
+          if (len_index)
+            {
+              tree len, bias;
+              if (len_loop_p)
+                len = vect_get_loop_len (loop_vinfo, gsi, lens, ncopies,
+                                         vectype_out, j, 1);
+              else
+                {
+                  /* Dummy LEN.  */
+                  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
+                  len = build_int_cst (iv_type,
+                                       TYPE_VECTOR_SUBPARTS (vectype_out));
+                }
+              signed char biasval
+                = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+              bias = build_int_cst (intQI_type_node, biasval);
+              vargs[varg++] = len;
+              vargs[varg++] = bias;
+            }
+          else if (mask_opno >= 0 && masked_loop_p)
             {
               tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
                                               ncopies, vectype_out, j);
-- 
2.36.3