* [PATCH] PR83017, parloops part
@ 2017-11-17 13:19 Richard Biener
0 siblings, 0 replies; only message in thread
From: Richard Biener @ 2017-11-17 13:19 UTC (permalink / raw)
To: gcc-patches
This makes the minimum number of iterations per thread a --param instead
of a magic define and handles loop->can_be_parallel independent of
whether flag_loop_parallelize_all was enabled (and thus also handle
loops our own dependence analysis can analyze but graphites could not).
It also adjusts the costing model for # latch executions vs. stmt
executions and makes the static profitability check match the dynamic one.
Bootstrapped and tested on x86_64-unknown-linux-gnu, applied to trunk.
Richard.
2017-11-17 Richard Biener <rguenther@suse.de>
PR tree-optimization/83017
* tree-parloops.c (MIN_PER_THREAD): Use --param parloops-min-per-thread.
(gen_parallel_loop): Properly count iterations.
(parallelize_loops): Handle loop->can_be_parallel independent
of flag_loop_parallelize_all. Make static profitability test match
the runtime one.
* params.def (PARAM_PARLOOPS_MIN_PER_THREAD): New.
* invoke.texi (parloops-min-per-thread): Document.
* gcc.dg/autopar/pr49960.c: Adjust.
Index: gcc/tree-parloops.c
===================================================================
--- gcc/tree-parloops.c (revision 254858)
+++ gcc/tree-parloops.c (working copy)
@@ -184,7 +184,7 @@ parloop
/* Minimal number of iterations of a loop that should be executed in each
thread. */
-#define MIN_PER_THREAD 100
+#define MIN_PER_THREAD PARAM_VALUE (PARAM_PARLOOPS_MIN_PER_THREAD)
/* Element of the hashtable, representing a
reduction in the current loop. */
@@ -2336,7 +2336,7 @@ gen_parallel_loop (struct loop *loop,
gcc_checking_assert (n_threads != 0);
many_iterations_cond =
fold_build2 (GE_EXPR, boolean_type_node,
- nit, build_int_cst (type, m_p_thread * n_threads));
+ nit, build_int_cst (type, m_p_thread * n_threads - 1));
many_iterations_cond
= fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
@@ -3299,15 +3299,6 @@ parallelize_loops (bool oacc_kernels_p)
fprintf (dump_file, "loop %d is innermost\n",loop->num);
}
- /* If we use autopar in graphite pass, we use its marked dependency
- checking results. */
- if (flag_loop_parallelize_all && !loop->can_be_parallel)
- {
- if (dump_file && (dump_flags & TDF_DETAILS))
- fprintf (dump_file, "loop is not parallel according to graphite\n");
- continue;
- }
-
if (!single_dom_exit (loop))
{
@@ -3325,15 +3316,17 @@ parallelize_loops (bool oacc_kernels_p)
|| loop_has_vector_phi_nodes (loop))
continue;
- estimated = estimated_stmt_executions_int (loop);
+ estimated = estimated_loop_iterations_int (loop);
if (estimated == -1)
- estimated = likely_max_stmt_executions_int (loop);
+ estimated = get_likely_max_loop_iterations_int (loop);
/* FIXME: Bypass this check as graphite doesn't update the
count and frequency correctly now. */
if (!flag_loop_parallelize_all
&& !oacc_kernels_p
&& ((estimated != -1
- && estimated <= (HOST_WIDE_INT) n_threads * MIN_PER_THREAD)
+ && (estimated
+ < ((HOST_WIDE_INT) n_threads
+ * (loop->inner ? 2 : MIN_PER_THREAD) - 1)))
/* Do not bother with loops in cold areas. */
|| optimize_loop_nest_for_size_p (loop)))
continue;
@@ -3347,7 +3340,7 @@ parallelize_loops (bool oacc_kernels_p)
if (loop_has_phi_with_address_arg (loop))
continue;
- if (!flag_loop_parallelize_all
+ if (!loop->can_be_parallel
&& !loop_parallel_p (loop, &parloop_obstack))
continue;
Index: gcc/params.def
===================================================================
--- gcc/params.def (revision 254858)
+++ gcc/params.def (working copy)
@@ -1240,6 +1240,12 @@ DEFPARAMENUM5 (PARAM_PARLOOPS_SCHEDULE,
static,
static, dynamic, guided, auto, runtime)
+DEFPARAM (PARAM_PARLOOPS_MIN_PER_THREAD,
+ "parloops-min-per-thread",
+ "Minimum number of iterations per thread of an innermost "
+ "parallelized loop.",
+ 100, 2, 0)
+
DEFPARAM (PARAM_MAX_SSA_NAME_QUERY_DEPTH,
"max-ssa-name-query-depth",
"Maximum recursion depth allowed when querying a property of an"
Index: gcc/doc/invoke.texi
===================================================================
--- gcc/doc/invoke.texi (revision 254858)
+++ gcc/doc/invoke.texi (working copy)
@@ -10816,6 +10816,12 @@ is 0.
Schedule type of omp schedule for loops parallelized by parloops (static,
dynamic, guided, auto, runtime). The default is static.
+@item parloops-min-per-thread
+The minimum number of iterations per thread of an innermost parallelized
+loop for which the parallelized variant is prefered over the single threaded
+one. The default is 100. Note that for a parallelized loop nest the
+minimum number of iterations of the outermost loop per thread is two.
+
@item max-ssa-name-query-depth
Maximum depth of recursion when querying properties of SSA names in things
like fold routines. One level of recursion corresponds to following a
Index: gcc/testsuite/gcc.dg/autopar/pr49960.c
===================================================================
--- gcc/testsuite/gcc.dg/autopar/pr49960.c (revision 254857)
+++ gcc/testsuite/gcc.dg/autopar/pr49960.c (working copy)
@@ -7,7 +7,8 @@
#define MA 400
int T[MA][MB],A[MA][NA],B[MB][NA];
-void MRTRBR(int MA_1, int NA_1, int MB_1)
+void __attribute__((noinline))
+MRTRBR(int MA_1, int NA_1, int MB_1)
{
int i,j, t,k;
@@ -21,7 +22,7 @@ void MRTRBR(int MA_1, int NA_1, int MB_1
/* The outer most loop is not parallel because for different k's there
is write-write dependency for T[i][j]. */
- /* The two inner loops don't get parallelized due to low number of
+ /* The innermost loop doesn't get parallelized due to low number of
iterations. */
for (k = 3; k < NA_1; k++)
@@ -38,7 +39,10 @@ void main ()
for (i = 3; i < MA; i++)
for (j = 3; j < MB; j++)
- T[i][j] = (i>j?i:j);
+ {
+ __asm__ volatile ("" : : : "memory");
+ T[i][j] = (i>j?i:j);
+ }
MRTRBR (MA,NA,MB);
@@ -48,7 +52,7 @@ void main ()
}
-/* Check that the outer most loop doesn't get parallelized (thus no loop gets parallelized) */
+/* Check that the outer most loop doesn't get parallelized. */
-/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 0 "parloops2" } } */
-/* { dg-final { scan-tree-dump-times "loopfn" 0 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops2" } } */
+/* { dg-final { scan-tree-dump-times "__builtin_GOMP_parallel" 1 "optimized" } } */
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2017-11-17 13:13 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-11-17 13:19 [PATCH] PR83017, parloops part Richard Biener
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).