From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mail-lj1-x229.google.com (mail-lj1-x229.google.com [IPv6:2a00:1450:4864:20::229]) by sourceware.org (Postfix) with ESMTPS id 373703858015 for ; Thu, 11 Jan 2024 09:51:29 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.2 sourceware.org 373703858015 Authentication-Results: sourceware.org; dmarc=pass (p=none dis=none) header.from=gmail.com Authentication-Results: sourceware.org; spf=pass smtp.mailfrom=gmail.com ARC-Filter: OpenARC Filter v1.0.0 sourceware.org 373703858015 Authentication-Results: server2.sourceware.org; arc=none smtp.remote-ip=2a00:1450:4864:20::229 ARC-Seal: i=1; a=rsa-sha256; d=sourceware.org; s=key; t=1704966691; cv=none; b=PrW1Y5yFIlfYHub5xhzWd3Y8Q/viVwwrkYnGaai7i7Xzj4HAHW/JhyQYsJ7xmVBH5rS/KfabAXpe/U+qHN6oUpxXWLZ9S2Vy1mXWz9egCluTgtDm7AgarMpn3aR2x3tT+2Mg9c48NIserTIavhuk3zOGQe/XT4qN0BB2xbu16dg= ARC-Message-Signature: i=1; a=rsa-sha256; d=sourceware.org; s=key; t=1704966691; c=relaxed/simple; bh=w8rzBix2wsu+YKBNsedHkd51Zp4fdzBvNSOg8RB0GbA=; h=DKIM-Signature:MIME-Version:From:Date:Message-ID:Subject:To; b=HhE+VJmgFnmBlsRbVM/1d3FaEtnVepQx2+9Gy4B3Y1AcgN4sYBOpknUg4Owg8TLoMR28/88ZZYR5P64ltRMVeBVkPwiGOaBqNQaEE8KHmW9q/1VmKECCbaDT2Qm+Z68K+iZvhbtvabYAlgcAXigyjI2itZp5qcbmZRrrEkueE9U= ARC-Authentication-Results: i=1; server2.sourceware.org Received: by mail-lj1-x229.google.com with SMTP id 38308e7fff4ca-2cd46e7ae8fso56740391fa.1 for ; Thu, 11 Jan 2024 01:51:29 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20230601; t=1704966688; x=1705571488; darn=gcc.gnu.org; h=content-transfer-encoding:cc:to:subject:message-id:date:from :in-reply-to:references:mime-version:from:to:cc:subject:date :message-id:reply-to; bh=YqIE+Wu/M/vfJ7RTNG3a+gcMZuMiMxyDEgWsDGxQ+7k=; b=YxXnFyVWza0O+urdr45ESK6/CV3oJmW5u91dfI599PAf9H39NK3V8MM7cGJcBsMkL4 NBwzygTpJqYEyHedgYNIcxjitZOzfN+OAhuHr6KIkw5RwLLD++G0HEopsCNxjbhonlia 8E3jvtO3gXxz44eI6NMntSbjTp74Wd0ZaKbeFmPT6FZzJjNPRfw3rxyDV2Im3eNxMODg /po8J1ys/83XENvRtLyybtZLahn0NVev8EVEooTK/coTgK/hD/rtRZDzmIRmInmtb8Q0 OR7AfndBm2gbKC3nyb9GBOwPqPvAQaOb7/F2pfrA5bITwwe4JsgK49kBYXXqj8vHn5A9 OeHg== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20230601; t=1704966688; x=1705571488; h=content-transfer-encoding:cc:to:subject:message-id:date:from :in-reply-to:references:mime-version:x-gm-message-state:from:to:cc :subject:date:message-id:reply-to; bh=YqIE+Wu/M/vfJ7RTNG3a+gcMZuMiMxyDEgWsDGxQ+7k=; b=KtpXzzDcavN/vtp3t6uLTDNSAz7pWSD+tKhPM1dVvU9vOhB9hJkGmO+rI5gwsCbf1v ls0xrh6R0Q+aoostl11YoB1fS/nDh1Bqha9QS8YmgxRm4u/cG4lFirpFzXLnKJ+nNjjx 2bCSouimxul+r+p3uGDy9IEDSnTKFtUNBq9uOgJ417KQ3QhOguOrK0kgKEj9PVPH4FpT pA9UEsQ8thvpYmS3X0YBUCQNFuOdcXp47YH0q8oKT5UQHKiL6bjC7BT5E5XX5126+x5u iqYspSnP7ey0EkqQ7KRfu7g/J6vEIysIO3XXWP+tyigQzECHSQpC1bt4oscJsoiRPKn+ G+1Q== X-Gm-Message-State: AOJu0YyqcMXgqwgcewqOzMudPlRPbK8li37/yswWKepT/K+PmgDq7R+/ FfowO0ghn77j/jMFt3UI5N6L9JH63//FXXjm4FBRbO7Hgus= X-Google-Smtp-Source: AGHT+IFBW2MCJECGa2ZnlOPS6uj51uSI8AAKkLZQSaeGsJeSxEofQp3VJdrAhImFLZYv60qGY0RBIaS8cgayyRmZ4U0= X-Received: by 2002:a2e:9644:0:b0:2cd:2ac6:9684 with SMTP id z4-20020a2e9644000000b002cd2ac69684mr107481ljh.106.1704966687487; Thu, 11 Jan 2024 01:51:27 -0800 (PST) MIME-Version: 1.0 References: In-Reply-To: From: Richard Biener Date: Thu, 11 Jan 2024 10:46:13 +0100 Message-ID: Subject: Re: [PATCH] Do not count unused scalar use when marking STMT_VINFO_LIVE_P [PR113091] To: Feng Xue OS Cc: "gcc-patches@gcc.gnu.org" Content-Type: text/plain; charset="UTF-8" Content-Transfer-Encoding: quoted-printable X-Spam-Status: No, score=-7.7 required=5.0 tests=BAYES_00,DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,DKIM_VALID_EF,FREEMAIL_FROM,GIT_PATCH_0,KAM_SHORT,RCVD_IN_DNSWL_NONE,SPF_HELO_NONE,SPF_PASS,TXREP,T_SCC_BODY_TEXT_LINE autolearn=ham autolearn_force=no version=3.4.6 X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on server2.sourceware.org List-Id: On Fri, Dec 29, 2023 at 11:29=E2=80=AFAM Feng Xue OS wrote: > > This patch is meant to fix over-estimation about SLP vector-to-scalar cos= t for > STMT_VINFO_LIVE_P statement. When pattern recognition is involved, a > statement whose definition is consumed in some pattern, may not be > included in the final replacement pattern statements, and would be skippe= d > when building SLP graph. > > * Original > char a_c =3D *(char *) a; > char b_c =3D *(char *) b; > unsigned short a_s =3D (unsigned short) a_c; > int a_i =3D (int) a_s; > int b_i =3D (int) b_c; > int r_i =3D a_i - b_i; > > * After pattern replacement > a_s =3D (unsigned short) a_c; > a_i =3D (int) a_s; > > patt_b_s =3D (unsigned short) b_c; // b_i =3D (int) b_c > patt_b_i =3D (int) patt_b_s; // b_i =3D (int) b_c > > patt_r_s =3D widen_minus(a_c, b_c); // r_i =3D a_i - b_i > patt_r_i =3D (int) patt_r_s; // r_i =3D a_i - b_i > > The definitions of a_i(original statement) and b_i(pattern statement) > are related to, but actually not part of widen_minus pattern. > Vectorizing the pattern does not cause these definition statements to > be marked as PURE_SLP. For this case, we need to recursively check > whether their uses are all absorbed into vectorized code. But there > is an exception that some use may participate in an vectorized > operation via an external SLP node containing that use as an element. > > Feng > > --- > .../gcc.target/aarch64/bb-slp-pr113091.c | 22 ++ > gcc/tree-vect-slp.cc | 189 ++++++++++++++---- > 2 files changed, 172 insertions(+), 39 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/aarch64/bb-slp-pr113091.c > > diff --git a/gcc/testsuite/gcc.target/aarch64/bb-slp-pr113091.c b/gcc/tes= tsuite/gcc.target/aarch64/bb-slp-pr113091.c > new file mode 100644 > index 00000000000..ff822e90b4a > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/bb-slp-pr113091.c > @@ -0,0 +1,22 @@ > +/* { dg-do compile } */ > +/* { dg-additional-options "-O3 -fdump-tree-slp-details -ftree-slp-vecto= rize" } */ > + > +int test(unsigned array[8]); > + > +int foo(char *a, char *b) > +{ > + unsigned array[8]; > + > + array[0] =3D (a[0] - b[0]); > + array[1] =3D (a[1] - b[1]); > + array[2] =3D (a[2] - b[2]); > + array[3] =3D (a[3] - b[3]); > + array[4] =3D (a[4] - b[4]); > + array[5] =3D (a[5] - b[5]); > + array[6] =3D (a[6] - b[6]); > + array[7] =3D (a[7] - b[7]); > + > + return test(array); > +} > + > +/* { dg-final { scan-tree-dump-times "Basic block will be vectorized usi= ng SLP" 1 "slp2" } } */ > diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc > index a82fca45161..d36ff37114e 100644 > --- a/gcc/tree-vect-slp.cc > +++ b/gcc/tree-vect-slp.cc > @@ -6418,6 +6418,84 @@ vect_slp_analyze_node_operations (vec_info *vinfo,= slp_tree node, > return res; > } > > +/* Given a definition DEF, analyze if it will have any live scalar use a= fter > + performing SLP vectorization whose information is represented by BB_V= INFO, > + and record result into hash map SCALAR_USE_MAP as cache for later fas= t > + check. */ > + > +static bool > +vec_slp_has_scalar_use (bb_vec_info bb_vinfo, tree def, > + hash_map &scalar_use_map) > +{ > + imm_use_iterator use_iter; > + gimple *use_stmt; > + > + if (bool *res =3D scalar_use_map.get (def)) > + return *res; > + > + FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def) > + { > + if (is_gimple_debug (use_stmt)) > + continue; > + > + stmt_vec_info use_stmt_info =3D bb_vinfo->lookup_stmt (use_stmt); > + > + if (!use_stmt_info) > + break; > + > + if (PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))) > + continue; > + > + /* Do not step forward when encounter PHI statement, since it may > + involve cyclic reference and cause infinite recursive invocation= . */ > + if (gimple_code (use_stmt) =3D=3D GIMPLE_PHI) > + break; > + > + /* When pattern recognition is involved, a statement whose definit= ion is > + consumed in some pattern, may not be included in the final repla= cement > + pattern statements, so would be skipped when building SLP graph. > + > + * Original > + char a_c =3D *(char *) a; > + char b_c =3D *(char *) b; > + unsigned short a_s =3D (unsigned short) a_c; > + int a_i =3D (int) a_s; > + int b_i =3D (int) b_c; > + int r_i =3D a_i - b_i; > + > + * After pattern replacement > + a_s =3D (unsigned short) a_c; > + a_i =3D (int) a_s; > + > + patt_b_s =3D (unsigned short) b_c; // b_i =3D (int) b_c > + patt_b_i =3D (int) patt_b_s; // b_i =3D (int) b_c > + > + patt_r_s =3D widen_minus(a_c, b_c); // r_i =3D a_i - b_i > + patt_r_i =3D (int) patt_r_s; // r_i =3D a_i - b_i > + > + The definitions of a_i(original statement) and b_i(pattern state= ment) > + are related to, but actually not part of widen_minus pattern. > + Vectorizing the pattern does not cause these definition statemen= ts to > + be marked as PURE_SLP. For this case, we need to recursively ch= eck > + whether their uses are all absorbed into vectorized code. But t= here > + is an exception that some use may participate in an vectorized > + operation via an external SLP node containing that use as an ele= ment. > + The parameter "scalar_use_map" tags such kind of SSA as having s= calar > + use in advance. */ > + tree lhs =3D gimple_get_lhs (use_stmt); > + > + if (!lhs || TREE_CODE (lhs) !=3D SSA_NAME > + || vec_slp_has_scalar_use (bb_vinfo, lhs, scalar_use_map)) This looks mostly good, the only worry I have is that this recursion will - for unvectorized uses - eventually go all the way down to the end of the vectorized region (aka the whole function). Since it's really patterns we are after it should be possible to limit the recursion to use_stmt which are covered by a pattern, aka STMT_VINFO_IN_PATTERN_P? PHIs should be then magically covered since we have no patterns for them (though explicitly handling is good for documentation purposes). Thanks, Richard. > + break; > + } > + > + bool found =3D !end_imm_use_stmt_p (&use_iter); > + bool added =3D scalar_use_map.put (def, found); > + > + gcc_assert (!added); > + return found; > +} > + > /* Mark lanes of NODE that are live outside of the basic-block vectorize= d > region and that can be vectorized using vectorizable_live_operation > with STMT_VINFO_LIVE_P. Not handled live operations will cause the > @@ -6427,6 +6505,7 @@ static void > vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node, > slp_instance instance, > stmt_vector_for_cost *cost_vec, > + hash_map &scalar_use_map, > hash_set &svisited, > hash_set &visited) > { > @@ -6451,32 +6530,22 @@ vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo= , slp_tree node, > def_operand_p def_p; > FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF) > { > - imm_use_iterator use_iter; > - gimple *use_stmt; > - stmt_vec_info use_stmt_info; > - FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p)= ) > - if (!is_gimple_debug (use_stmt)) > - { > - use_stmt_info =3D bb_vinfo->lookup_stmt (use_stmt); > - if (!use_stmt_info > - || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_i= nfo))) > - { > - STMT_VINFO_LIVE_P (stmt_info) =3D true; > - if (vectorizable_live_operation (bb_vinfo, stmt_info, > - node, instance, i, > - false, cost_vec)) > - /* ??? So we know we can vectorize the live stmt > - from one SLP node. If we cannot do so from all > - or none consistently we'd have to record which > - SLP node (and lane) we want to use for the live > - operation. So make sure we can code-generate > - from all nodes. */ > - mark_visited =3D false; > - else > - STMT_VINFO_LIVE_P (stmt_info) =3D false; > - break; > - } > - } > + if (vec_slp_has_scalar_use (bb_vinfo, DEF_FROM_PTR (def_p), > + scalar_use_map)) > + { > + STMT_VINFO_LIVE_P (stmt_info) =3D true; > + if (vectorizable_live_operation (bb_vinfo, stmt_info, node, > + instance, i, false, cost_v= ec)) > + /* ??? So we know we can vectorize the live stmt from on= e SLP > + node. If we cannot do so from all or none consistentl= y > + we'd have to record which SLP node (and lane) we want = to > + use for the live operation. So make sure we can > + code-generate from all nodes. */ > + mark_visited =3D false; > + else > + STMT_VINFO_LIVE_P (stmt_info) =3D false; > + } > + > /* We have to verify whether we can insert the lane extract > before all uses. The following is a conservative approximat= ion. > We cannot put this into vectorizable_live_operation because > @@ -6495,6 +6564,10 @@ vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo,= slp_tree node, > from the latest stmt in a node. So we compensate for this > during code-generation, simply not replacing uses for those > hopefully rare cases. */ > + imm_use_iterator use_iter; > + gimple *use_stmt; > + stmt_vec_info use_stmt_info; > + > if (STMT_VINFO_LIVE_P (stmt_info)) > FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_= p)) > if (!is_gimple_debug (use_stmt) > @@ -6517,8 +6590,56 @@ vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo,= slp_tree node, > slp_tree child; > FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) > if (child && SLP_TREE_DEF_TYPE (child) =3D=3D vect_internal_def) > - vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance, > - cost_vec, svisited, visited); > + vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance, cost_vec, > + scalar_use_map, svisited, visited); > +} > + > +/* Traverse all slp instances of BB_VINFO, and mark lanes of every node = that > + are live outside of the basic-block vectorized region and that can be > + vectorized using vectorizable_live_operation with STMT_VINFO_LIVE_P. = */ > + > +static void > +vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo) > +{ > + if (bb_vinfo->slp_instances.is_empty ()) > + return; > + > + hash_set svisited; > + hash_set visited; > + hash_map scalar_use_map; > + auto_vec worklist; > + > + for (slp_instance instance : bb_vinfo->slp_instances) > + if (!visited.add (SLP_INSTANCE_TREE (instance))) > + worklist.safe_push (SLP_INSTANCE_TREE (instance)); > + > + do > + { > + slp_tree node =3D worklist.pop (); > + > + if (SLP_TREE_DEF_TYPE (node) =3D=3D vect_external_def) > + { > + for (tree op : SLP_TREE_SCALAR_OPS (node)) > + if (TREE_CODE (op) =3D=3D SSA_NAME) > + scalar_use_map.put (op, true); > + } > + else > + { > + for (slp_tree child : SLP_TREE_CHILDREN (node)) > + if (child && !visited.add (child)) > + worklist.safe_push (child); > + } > + } while (!worklist.is_empty ()); > + > + visited.empty (); > + > + for (slp_instance instance : bb_vinfo->slp_instances) > + { > + vect_location =3D instance->location (); > + vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance= ), > + instance, &instance->cost_vec, > + scalar_use_map, svisited, visited); > + } > } > > /* Determine whether we can vectorize the reduction epilogue for INSTANC= E. */ > @@ -6684,17 +6805,7 @@ vect_slp_analyze_operations (vec_info *vinfo) > > /* Compute vectorizable live stmts. */ > if (bb_vec_info bb_vinfo =3D dyn_cast (vinfo)) > - { > - hash_set svisited; > - hash_set visited; > - for (i =3D 0; vinfo->slp_instances.iterate (i, &instance); ++i) > - { > - vect_location =3D instance->location (); > - vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (insta= nce), > - instance, &instance->cost_vec, svi= sited, > - visited); > - } > - } > + vect_bb_slp_mark_live_stmts (bb_vinfo); > > return !vinfo->slp_instances.is_empty (); > } > -- > 2.17.1