public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
From: "Kewen.Lin" <linkw@linux.ibm.com>
To: richard.sandiford@arm.com
Cc: Segher Boessenkool <segher@kernel.crashing.org>,
	Bill Schmidt <wschmidt@linux.ibm.com>,
	"Kewen.Lin via Gcc-patches" <gcc-patches@gcc.gnu.org>,
	Richard Biener <richard.guenther@gmail.com>
Subject: [PATCH v2] predcom: Enabled by loop vect at O2 [PR100794]
Date: Thu, 3 Jun 2021 11:33:06 +0800	[thread overview]
Message-ID: <4a817aa5-417e-fafb-9c28-52f93aa53082@linux.ibm.com> (raw)
In-Reply-To: <mptwnrcw5ds.fsf@arm.com>

[-- Attachment #1: Type: text/plain, Size: 7889 bytes --]

Hi Richard,

on 2021/6/3 上午1:19, Richard Sandiford wrote:
> "Kewen.Lin via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
>> Hi,
>>
>> As PR100794 shows, in the current implementation PRE bypasses
>> some optimization to avoid introducing loop carried dependence
>> which stops loop vectorizer to vectorize the loop.  At -O2,
>> there is no downstream pass to re-catch this kind of opportunity
>> if loop vectorizer fails to vectorize that loop.
>>
>> This patch follows Richi's suggestion in the PR, if predcom flag
>> isn't set and loop vectorization will enable predcom without any
>> unrolling implicitly.  The Power9 SPEC2017 evaluation showed it
>> can speed up 521.wrf_r 3.30% and 554.roms_r 1.08% at very-cheap
>> cost model, no remarkable impact at cheap cost model, the build
>> time and size impact is fine (see the PR for the details).
>>
>> By the way, I tested another proposal to guard PRE not skip the
>> optimization for cheap and very-cheap vect cost models, the
>> evaluation results showed it's fine with very cheap cost model,
>> but it can degrade some bmks like 521.wrf_r -9.17% and
>> 549.fotonik3d_r -2.07% etc.
>>
>> Bootstrapped/regtested on powerpc64le-linux-gnu P9,
>> x86_64-redhat-linux and aarch64-linux-gnu.
>>
>> Is it ok for trunk?
>>
>> BR,
>> Kewen
>> -----
>> gcc/ChangeLog:
>>
>> 	PR tree-optimization/100794
>> 	* tree-predcom.c (tree_predictive_commoning_loop): Add parameter
>> 	allow_unroll_p and only allow unrolling when it's true.
>> 	(tree_predictive_commoning): Add parameter allow_unroll_p and
>> 	adjust for it.
>> 	(run_tree_predictive_commoning): Likewise.
>> 	(class pass_predcom): Add private member allow_unroll_p.
>> 	(pass_predcom::pass_predcom): Init allow_unroll_p.
>> 	(pass_predcom::gate): Check flag_tree_loop_vectorize and 
>> 	global_options_set.x_flag_predictive_commoning.
>> 	(pass_predcom::execute): Adjust for allow_unroll_p.
>>
>> gcc/testsuite/ChangeLog:
>>
>> 	PR tree-optimization/100794
>> 	* gcc.dg/tree-ssa/pr100794.c: New test.
>>
>>  gcc/testsuite/gcc.dg/tree-ssa/pr100794.c | 20 +++++++++
>>  gcc/tree-predcom.c                       | 57 +++++++++++++++++-------
>>  2 files changed, 60 insertions(+), 17 deletions(-)
>>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr100794.c
>>
>> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr100794.c b/gcc/testsuite/gcc.dg/tree-ssa/pr100794.c
>> new file mode 100644
>> index 00000000000..6f707ae7fba
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr100794.c
>> @@ -0,0 +1,20 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-O2 -ftree-loop-vectorize -fdump-tree-pcom-details -fdisable-tree-vect" } */
>> +
>> +extern double arr[100];
>> +extern double foo (double, double);
>> +extern double sum;
>> +
>> +void
>> +test (int i_0, int i_n)
>> +{
>> +  int i;
>> +  for (i = i_0; i < i_n - 1; i++)
>> +    {
>> +      double a = arr[i];
>> +      double b = arr[i + 1];
>> +      sum += a * b;
>> +    }
>> +}
>> +
>> +/* { dg-final { scan-tree-dump "Executing predictive commoning without unrolling" "pcom" } } */
>> diff --git a/gcc/tree-predcom.c b/gcc/tree-predcom.c
>> index 02f911a08bb..65a93c8e505 100644
>> --- a/gcc/tree-predcom.c
>> +++ b/gcc/tree-predcom.c
>> @@ -3178,13 +3178,13 @@ insert_init_seqs (class loop *loop, vec<chain_p> chains)
>>     applied to this loop.  */
>>  
>>  static unsigned
>> -tree_predictive_commoning_loop (class loop *loop)
>> +tree_predictive_commoning_loop (class loop *loop, bool allow_unroll_p)
>>  {
>>    vec<data_reference_p> datarefs;
>>    vec<ddr_p> dependences;
>>    struct component *components;
>>    vec<chain_p> chains = vNULL;
>> -  unsigned unroll_factor;
>> +  unsigned unroll_factor = 0;
>>    class tree_niter_desc desc;
>>    bool unroll = false, loop_closed_ssa = false;
>>  
>> @@ -3272,11 +3272,13 @@ tree_predictive_commoning_loop (class loop *loop)
>>        dump_chains (dump_file, chains);
>>      }
>>  
>> -  /* Determine the unroll factor, and if the loop should be unrolled, ensure
>> -     that its number of iterations is divisible by the factor.  */
>> -  unroll_factor = determine_unroll_factor (chains);
>> -  unroll = (unroll_factor > 1
>> -	    && can_unroll_loop_p (loop, unroll_factor, &desc));
>> +  if (allow_unroll_p)
>> +    /* Determine the unroll factor, and if the loop should be unrolled, ensure
>> +       that its number of iterations is divisible by the factor.  */
>> +    unroll_factor = determine_unroll_factor (chains);
>> +
>> +  if (unroll_factor > 1)
>> +    unroll = can_unroll_loop_p (loop, unroll_factor, &desc);
>>  
>>    /* Execute the predictive commoning transformations, and possibly unroll the
>>       loop.  */
>> @@ -3319,7 +3321,7 @@ tree_predictive_commoning_loop (class loop *loop)
>>  /* Runs predictive commoning.  */
>>  
>>  unsigned
>> -tree_predictive_commoning (void)
>> +tree_predictive_commoning (bool allow_unroll_p)
>>  {
>>    class loop *loop;
>>    unsigned ret = 0, changed = 0;
>> @@ -3328,7 +3330,7 @@ tree_predictive_commoning (void)
>>    FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST)
>>      if (optimize_loop_for_speed_p (loop))
>>        {
>> -	changed |= tree_predictive_commoning_loop (loop);
>> +	changed |= tree_predictive_commoning_loop (loop, allow_unroll_p);
>>        }
>>    free_original_copy_tables ();
>>  
>> @@ -3355,12 +3357,12 @@ tree_predictive_commoning (void)
>>  /* Predictive commoning Pass.  */
>>  
>>  static unsigned
>> -run_tree_predictive_commoning (struct function *fun)
>> +run_tree_predictive_commoning (struct function *fun, bool allow_unroll_p)
>>  {
>>    if (number_of_loops (fun) <= 1)
>>      return 0;
>>  
>> -  return tree_predictive_commoning ();
>> +  return tree_predictive_commoning (allow_unroll_p);
>>  }
>>  
>>  namespace {
>> @@ -3382,15 +3384,36 @@ class pass_predcom : public gimple_opt_pass
>>  {
>>  public:
>>    pass_predcom (gcc::context *ctxt)
>> -    : gimple_opt_pass (pass_data_predcom, ctxt)
>> +    : gimple_opt_pass (pass_data_predcom, ctxt),
>> +      allow_unroll_p (true)
>>    {}
>>  
>>    /* opt_pass methods: */
>> -  virtual bool gate (function *) { return flag_predictive_commoning != 0; }
>> -  virtual unsigned int execute (function *fun)
>> -    {
>> -      return run_tree_predictive_commoning (fun);
>> -    }
>> +  virtual bool
>> +  gate (function *)
>> +  {
>> +    if (flag_predictive_commoning != 0)
>> +      return true;
>> +    /* Loop vectorization enables predictive commoning implicitly
>> +       only if predictive commoning isn't set explicitly, and it
>> +       doesn't allow unrolling.  */
>> +    if (flag_tree_loop_vectorize
>> +	&& !global_options_set.x_flag_predictive_commoning)
>> +      {
>> +	allow_unroll_p = false;
>> +	return true;
>> +      }
>> +    return false;
>> +  }
>> +
>> +  virtual unsigned int
>> +  execute (function *fun)
>> +  {
>> +    return run_tree_predictive_commoning (fun, allow_unroll_p);
>> +  }
>> +
>> +private:
>> +  bool allow_unroll_p;
>>  
>>  }; // class pass_predcom
> 
> Calculating allow_unroll_p this way doesn't look robust against
> changes in options caused by pragmas, etc.  Would it work if we
> dropped the member variable and just passed flag_predictive_commoning != 0
> to run_tree_predictive commoning?
> 

Thanks for the comments!  Yeah, it would work well.  The updated version
of patch has been attached.

BR,
Kewen
-----
gcc/ChangeLog:

	PR tree-optimization/100794
	* tree-predcom.c (tree_predictive_commoning_loop): Add parameter
	allow_unroll_p and only allow unrolling when it's true.
	(tree_predictive_commoning): Add parameter allow_unroll_p and
	adjust for it.
	(run_tree_predictive_commoning): Likewise.
	(pass_predcom::gate): Check flag_tree_loop_vectorize and 
	global_options_set.x_flag_predictive_commoning.
	(pass_predcom::execute): Adjust for allow_unroll_p.

gcc/testsuite/ChangeLog:

	PR tree-optimization/100794
	* gcc.dg/tree-ssa/pr100794.c: New test.

[-- Attachment #2: predcom_enabled_v2.diff --]
[-- Type: text/plain, Size: 4058 bytes --]

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr100794.c b/gcc/testsuite/gcc.dg/tree-ssa/pr100794.c
new file mode 100644
index 00000000000..6f707ae7fba
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr100794.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-loop-vectorize -fdump-tree-pcom-details -fdisable-tree-vect" } */
+
+extern double arr[100];
+extern double foo (double, double);
+extern double sum;
+
+void
+test (int i_0, int i_n)
+{
+  int i;
+  for (i = i_0; i < i_n - 1; i++)
+    {
+      double a = arr[i];
+      double b = arr[i + 1];
+      sum += a * b;
+    }
+}
+
+/* { dg-final { scan-tree-dump "Executing predictive commoning without unrolling" "pcom" } } */
diff --git a/gcc/tree-predcom.c b/gcc/tree-predcom.c
index 02f911a08bb..ac1674d5486 100644
--- a/gcc/tree-predcom.c
+++ b/gcc/tree-predcom.c
@@ -3178,13 +3178,13 @@ insert_init_seqs (class loop *loop, vec<chain_p> chains)
    applied to this loop.  */
 
 static unsigned
-tree_predictive_commoning_loop (class loop *loop)
+tree_predictive_commoning_loop (class loop *loop, bool allow_unroll_p)
 {
   vec<data_reference_p> datarefs;
   vec<ddr_p> dependences;
   struct component *components;
   vec<chain_p> chains = vNULL;
-  unsigned unroll_factor;
+  unsigned unroll_factor = 0;
   class tree_niter_desc desc;
   bool unroll = false, loop_closed_ssa = false;
 
@@ -3272,11 +3272,13 @@ tree_predictive_commoning_loop (class loop *loop)
       dump_chains (dump_file, chains);
     }
 
-  /* Determine the unroll factor, and if the loop should be unrolled, ensure
-     that its number of iterations is divisible by the factor.  */
-  unroll_factor = determine_unroll_factor (chains);
-  unroll = (unroll_factor > 1
-	    && can_unroll_loop_p (loop, unroll_factor, &desc));
+  if (allow_unroll_p)
+    /* Determine the unroll factor, and if the loop should be unrolled, ensure
+       that its number of iterations is divisible by the factor.  */
+    unroll_factor = determine_unroll_factor (chains);
+
+  if (unroll_factor > 1)
+    unroll = can_unroll_loop_p (loop, unroll_factor, &desc);
 
   /* Execute the predictive commoning transformations, and possibly unroll the
      loop.  */
@@ -3319,7 +3321,7 @@ tree_predictive_commoning_loop (class loop *loop)
 /* Runs predictive commoning.  */
 
 unsigned
-tree_predictive_commoning (void)
+tree_predictive_commoning (bool allow_unroll_p)
 {
   class loop *loop;
   unsigned ret = 0, changed = 0;
@@ -3328,7 +3330,7 @@ tree_predictive_commoning (void)
   FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST)
     if (optimize_loop_for_speed_p (loop))
       {
-	changed |= tree_predictive_commoning_loop (loop);
+	changed |= tree_predictive_commoning_loop (loop, allow_unroll_p);
       }
   free_original_copy_tables ();
 
@@ -3355,12 +3357,12 @@ tree_predictive_commoning (void)
 /* Predictive commoning Pass.  */
 
 static unsigned
-run_tree_predictive_commoning (struct function *fun)
+run_tree_predictive_commoning (struct function *fun, bool allow_unroll_p)
 {
   if (number_of_loops (fun) <= 1)
     return 0;
 
-  return tree_predictive_commoning ();
+  return tree_predictive_commoning (allow_unroll_p);
 }
 
 namespace {
@@ -3386,11 +3388,27 @@ public:
   {}
 
   /* opt_pass methods: */
-  virtual bool gate (function *) { return flag_predictive_commoning != 0; }
-  virtual unsigned int execute (function *fun)
-    {
-      return run_tree_predictive_commoning (fun);
-    }
+  virtual bool
+  gate (function *)
+  {
+    if (flag_predictive_commoning != 0)
+      return true;
+    /* Loop vectorization enables predictive commoning implicitly
+       only if predictive commoning isn't set explicitly, and it
+       doesn't allow unrolling.  */
+    if (flag_tree_loop_vectorize
+	&& !global_options_set.x_flag_predictive_commoning)
+      return true;
+
+    return false;
+  }
+
+  virtual unsigned int
+  execute (function *fun)
+  {
+    bool allow_unroll_p = flag_predictive_commoning != 0;
+    return run_tree_predictive_commoning (fun, allow_unroll_p);
+  }
 
 }; // class pass_predcom
 

  reply	other threads:[~2021-06-03  3:33 UTC|newest]

Thread overview: 4+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-06-02  9:34 [PATCH] " Kewen.Lin
2021-06-02 17:19 ` Richard Sandiford
2021-06-03  3:33   ` Kewen.Lin [this message]
2021-06-07 14:51     ` [PATCH v2] " Richard Biener

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=4a817aa5-417e-fafb-9c28-52f93aa53082@linux.ibm.com \
    --to=linkw@linux.ibm.com \
    --cc=gcc-patches@gcc.gnu.org \
    --cc=richard.guenther@gmail.com \
    --cc=richard.sandiford@arm.com \
    --cc=segher@kernel.crashing.org \
    --cc=wschmidt@linux.ibm.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).