* [gomp3] Remove GOMP_BLOCKTIME env var, add GOMP_SPINCOUNT and handle OMP_WAIT_POLICY
@ 2008-03-19 13:47 Jakub Jelinek
2008-03-19 23:40 ` Liaskovitis, Vasileios
0 siblings, 1 reply; 2+ messages in thread
From: Jakub Jelinek @ 2008-03-19 13:47 UTC (permalink / raw)
To: gcc-patches; +Cc: Ulrich Drepper
[-- Attachment #1: Type: text/plain, Size: 11787 bytes --]
Hi!
I've been playing with runtime detection of the number of
do_wait spins per millisecond (see attached proglet), unfortunately
on some architectures (e.g. ppc G5) it is completely unreliable, on others,
including x86_64, it is from time to time 10 times off. But estimating
the spin count so that it is at most 10 times off at least on most CPUs
doesn't need any runtime detection, especially given that CPU frequencies
hit the ceiling.
So, instead of letting users specify GOMP_BLOCKTIME as time in milliseconds
to spin, this patch lets users specify GOMP_SPINCOUNT as number of spins.
If this isn't specified, the default is 30g for OMP_WAIT_POLICY=active
(30g is roughly 5 minutes, could be 2 or 15 minutes depending on hw)
20m for no OMP_WAIT_POLICY (roughly 0.2 seconds) and 0 for
OMP_WAIT_POLICY=passive. Additionally, the patch tracks number of threads
currently managed by libgomp and if there are more libgomp managed threads
than available CPUs, the spin counts decrease radically (1k for
OMP_WAIT_POLICY=active, 100 for no OMP_WAIT_POLICY).
The busy waiting duration is a hint anyway, so not being very precise is
IMHO not a big deal, but e.g. doing clock_gettime every few iterations would
increase latency a lot.
2008-03-19 Jakub Jelinek <jakub@redhat.com>
* libgomp.h (gomp_active_wait_policy): Remove decl.
(gomp_throttled_spin_count_var, gomp_available_cpus,
gomp_managed_threads): New extern decls.
* team.c (gomp_team_start, gomp_team_end): If number of threads
changed, adjust atomically gomp_managed_threads.
* env.c (gomp_active_wait_policy, gomp_block_time_var): Remove.
(gomp_throttled_spin_count_var, gomp_available_cpus,
gomp_managed_threads): New variables.
(parse_millis): Removed.
(parse_spincount): New function.
(parse_wait_policy): Return -1/0/1 instead of setting
gomp_active_wait_policy.
(initialize_env): Call gomp_init_num_threads unconditionally.
Initialize gomp_available_cpus. Call parse_spincount instead
of parse_millis, initialize gomp_{,throttled_}spin_count_var
depending on presence and value of OMP_WAIT_POLICY and
GOMP_SPINCOUNT env vars.
* config/linux/wait.h (do_wait): Use gomp_throttled_spin_count_var
instead of gomp_spin_count_var if gomp_managed_threads >
gomp_available_cpus.
--- libgomp/team.c (revision 133292)
+++ libgomp/team.c (working copy)
@@ -287,8 +287,24 @@ gomp_team_start (void (*fn) (void *), vo
}
}
+ if (__builtin_expect (nthreads > old_threads_used, 0))
+ {
+ long diff = (long) nthreads - (long) old_threads_used;
+
+ if (old_threads_used == 0)
+ --diff;
+
+#ifdef HAVE_SYNC_BUILTINS
+ __sync_fetch_and_add (&gomp_managed_threads, diff);
+#else
+ gomp_mutex_lock (&gomp_remaining_threads_lock);
+ gomp_managed_threads += diff;
+ gomp_mutex_unlock (&gomp_remaining_threads_lock);
+#endif
+ }
+
attr = &gomp_thread_attr;
- if (gomp_cpu_affinity != NULL)
+ if (__builtin_expect (gomp_cpu_affinity != NULL, 0))
{
size_t stacksize;
pthread_attr_init (&thread_attr);
@@ -328,7 +344,7 @@ gomp_team_start (void (*fn) (void *), vo
gomp_fatal ("Thread creation failed: %s", strerror (err));
}
- if (gomp_cpu_affinity != NULL)
+ if (__builtin_expect (gomp_cpu_affinity != NULL, 0))
pthread_attr_destroy (&thread_attr);
do_release:
@@ -338,8 +354,20 @@ gomp_team_start (void (*fn) (void *), vo
that should arrive back at the end of this team. The extra
threads should be exiting. Note that we arrange for this test
to never be true for nested teams. */
- if (nthreads < old_threads_used)
- gomp_barrier_reinit (&gomp_threads_dock, nthreads);
+ if (__builtin_expect (nthreads < old_threads_used, 0))
+ {
+ long diff = (long) nthreads - (long) old_threads_used;
+
+ gomp_barrier_reinit (&gomp_threads_dock, nthreads);
+
+#ifdef HAVE_SYNC_BUILTINS
+ __sync_fetch_and_add (&gomp_managed_threads, diff);
+#else
+ gomp_mutex_lock (&gomp_remaining_threads_lock);
+ gomp_managed_threads += diff;
+ gomp_mutex_unlock (&gomp_remaining_threads_lock);
+#endif
+ }
}
@@ -357,6 +385,17 @@ gomp_team_end (void)
gomp_end_task ();
thr->ts = team->prev_ts;
+ if (__builtin_expect (thr->ts.team != NULL, 0))
+ {
+#ifdef HAVE_SYNC_BUILTINS
+ __sync_fetch_and_add (&gomp_managed_threads, 1L - team->nthreads);
+#else
+ gomp_mutex_lock (&gomp_remaining_threads_lock);
+ gomp_managed_threads -= team->nthreads - 1L;
+ gomp_mutex_unlock (&gomp_remaining_threads_lock);
+#endif
+ }
+
free_team (team);
}
--- libgomp/env.c (revision 133291)
+++ libgomp/env.c (working copy)
@@ -57,7 +57,6 @@ struct gomp_task_icv gomp_global_icv = {
};
unsigned short *gomp_cpu_affinity;
-bool gomp_active_wait_policy = false;
size_t gomp_cpu_affinity_len;
unsigned long gomp_max_active_levels_var = INT_MAX;
unsigned long gomp_thread_limit_var = ULONG_MAX;
@@ -65,8 +64,8 @@ unsigned long gomp_remaining_threads_cou
#ifndef HAVE_SYNC_BUILTINS
gomp_mutex_t gomp_remaining_threads_lock;
#endif
-static unsigned long gomp_block_time_var;
-unsigned long long gomp_spin_count_var;
+unsigned long gomp_available_cpus = 1, gomp_managed_threads = 1;
+unsigned long long gomp_spin_count_var, gomp_throttled_spin_count_var;
/* Parse the OMP_SCHEDULE environment variable. */
@@ -239,14 +238,14 @@ parse_stacksize (const char *name, unsig
return false;
}
-/* Parse the GOMP_BLOCKTIME environment varible. Return true if one was
+/* Parse the GOMP_SPINCOUNT environment varible. Return true if one was
present and it was successfully parsed. */
static bool
-parse_millis (const char *name, unsigned long *pvalue)
+parse_spincount (const char *name, unsigned long long *pvalue)
{
char *env, *end;
- unsigned long value, mult = 1;
+ unsigned long long value, mult = 1;
env = getenv (name);
if (env == NULL)
@@ -257,17 +256,16 @@ parse_millis (const char *name, unsigned
if (*env == '\0')
goto invalid;
- if (strncasecmp (env, "infinite", 8) != 0
- || strncasecmp (env, "infinity", 8) != 0
- || strncasecmp (env, "unexpire", 8) != 0)
+ if (strncasecmp (env, "infinite", 8) == 0
+ || strncasecmp (env, "infinity", 8) == 0)
{
- value = ULONG_MAX;
+ value = ~0ULL;
end = env + 8;
goto check_tail;
}
errno = 0;
- value = strtoul (env, &end, 10);
+ value = strtoull (env, &end, 10);
if (errno)
goto invalid;
@@ -277,17 +275,17 @@ parse_millis (const char *name, unsigned
{
switch (tolower (*end))
{
- case 's':
- mult = 1000;
+ case 'k':
+ mult = 1000LL;
break;
case 'm':
- mult = 60 * 1000;
+ mult = 1000LL * 1000LL;
break;
- case 'h':
- mult = 60 * 60 * 1000;
+ case 'g':
+ mult = 1000LL * 1000LL * 1000LL;
break;
- case 'd':
- mult = 24 * 60 * 60 * 1000;
+ case 't':
+ mult = 1000LL * 1000LL * 1000LL * 1000LL;
break;
default:
goto invalid;
@@ -300,8 +298,8 @@ parse_millis (const char *name, unsigned
goto invalid;
}
- if (value > ULONG_MAX / mult)
- value = ULONG_MAX;
+ if (value > ~0ULL / mult)
+ value = ~0ULL;
else
value *= mult;
@@ -348,33 +346,36 @@ parse_boolean (const char *name, bool *v
/* Parse the OMP_WAIT_POLICY environment variable and store the
result in gomp_active_wait_policy. */
-static void
+static int
parse_wait_policy (void)
{
const char *env;
+ int ret = -1;
env = getenv ("OMP_WAIT_POLICY");
if (env == NULL)
- return;
+ return -1;
while (isspace ((unsigned char) *env))
++env;
if (strncasecmp (env, "active", 6) == 0)
{
- gomp_active_wait_policy = true;
+ ret = 1;
env += 6;
}
else if (strncasecmp (env, "passive", 7) == 0)
{
- gomp_active_wait_policy = false;
+ ret = 0;
env += 7;
}
else
env = "X";
while (isspace ((unsigned char) *env))
++env;
- if (*env != '\0')
- gomp_error ("Invalid value for environment variable OMP_WAIT_POLICY");
+ if (*env == '\0')
+ return ret;
+ gomp_error ("Invalid value for environment variable OMP_WAIT_POLICY");
+ return -1;
}
/* Parse the GOMP_CPU_AFFINITY environment varible. Return true if one was
@@ -472,6 +473,7 @@ static void __attribute__((constructor))
initialize_env (void)
{
unsigned long stacksize;
+ int wait_policy;
/* Do a compile time check that mkomp_h.pl did good job. */
omp_check_defines ();
@@ -479,7 +481,6 @@ initialize_env (void)
parse_schedule ();
parse_boolean ("OMP_DYNAMIC", &gomp_global_icv.dyn_var);
parse_boolean ("OMP_NESTED", &gomp_global_icv.nest_var);
- parse_wait_policy ();
parse_unsigned_long ("OMP_MAX_ACTIVE_LEVELS", &gomp_max_active_levels_var);
parse_unsigned_long ("OMP_THREAD_LIMIT", &gomp_thread_limit_var);
if (gomp_thread_limit_var != ULONG_MAX)
@@ -489,23 +490,34 @@ initialize_env (void)
gomp_mutex_init (&gomp_remaining_threads_lock);
#endif
}
+ gomp_init_num_threads ();
+ gomp_available_cpus = gomp_global_icv.nthreads_var;
if (!parse_unsigned_long ("OMP_NUM_THREADS", &gomp_global_icv.nthreads_var))
- gomp_init_num_threads ();
+ gomp_global_icv.nthreads_var = gomp_available_cpus;
if (parse_affinity ())
gomp_init_affinity ();
- if (!parse_millis ("GOMP_BLOCKTIME", &gomp_block_time_var))
- {
- if (gomp_active_wait_policy)
- gomp_block_time_var = 200; /* 200ms */
- }
- if (gomp_block_time_var > 0)
+ wait_policy = parse_wait_policy ();
+ if (!parse_spincount ("GOMP_SPINCOUNT", &gomp_spin_count_var))
{
- if (gomp_block_time_var == ULONG_MAX)
- gomp_spin_count_var = ~0ULL;
- else
- /* Estimate translation of gomp_block_time_var in milliseconds to
- spin count. */;
- }
+ /* Using a rough estimation of 100000 spins per msec,
+ use 5 min blocking for OMP_WAIT_POLICY=active,
+ 200 msec blocking when OMP_WAIT_POLICY is not specificed
+ and 0 when OMP_WAIT_POLICY=passive.
+ Depending on the CPU speed, this can be e.g. 5 times longer
+ or 5 times shorter. */
+ if (wait_policy > 0)
+ gomp_spin_count_var = 30000000000LL;
+ else if (wait_policy < 0)
+ gomp_spin_count_var = 20000000LL;
+ }
+ /* gomp_throttled_spin_count_var is used when there are more libgomp
+ managed threads than available CPUs. Use very short spinning. */
+ if (wait_policy > 0)
+ gomp_throttled_spin_count_var = 1000LL;
+ else if (wait_policy < 0)
+ gomp_throttled_spin_count_var = 100LL;
+ if (gomp_throttled_spin_count_var > gomp_spin_count_var)
+ gomp_throttled_spin_count_var = gomp_spin_count_var;
/* Not strictly environment related, but ordering constructors is tricky. */
pthread_attr_init (&gomp_thread_attr);
--- libgomp/libgomp.h (revision 133305)
+++ libgomp/libgomp.h (working copy)
@@ -190,8 +190,8 @@ extern unsigned long gomp_remaining_thre
extern gomp_mutex_t gomp_remaining_threads_lock;
#endif
extern unsigned long gomp_max_active_levels_var;
-extern bool gomp_active_wait_policy;
-extern unsigned long long gomp_spin_count_var;
+extern unsigned long long gomp_spin_count_var, gomp_throttled_spin_count_var;
+extern unsigned long gomp_available_cpus, gomp_managed_threads;
/* This structure describes a "task" to be run by a thread. At present
we implement only synchronous tasks, i.e. no tasks are deferred or
--- libgomp/config/linux/wait.h (revision 133339)
+++ libgomp/config/linux/wait.h (working copy)
@@ -51,6 +51,8 @@ static inline void do_wait (int *addr, i
{
unsigned long long i, count = gomp_spin_count_var;
+ if (__builtin_expect (gomp_managed_threads > gomp_available_cpus, 0))
+ count = gomp_throttled_spin_count_var;
for (i = 0; i < count; i++)
if (__builtin_expect (*addr != val, 0))
return;
Jakub
[-- Attachment #2: spins_per_msec.c --]
[-- Type: text/plain, Size: 2554 bytes --]
#include <time.h>
#include <stdio.h>
/* Don't use CLOCK_THREAD_CPUTIME_ID with LinuxThreads. */
#if defined __GLIBC__ && (__GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 5))
# define clockid CLOCK_THREAD_CPUTIME_ID
#else
# define clockid CLOCK_PROCESS_CPUTIME_ID
#endif
static inline void
cpu_relax (void)
{
#if defined __i386__ || defined __x86_64__
__asm volatile ("rep; nop" : : : "memory");
#elif defined __ia64__
__asm volatile ("hint @pause" : : : "memory");
#else
__asm volatile ("" : : : "memory");
#endif
}
unsigned long long gomp_spin_count_var;
static void
__attribute__((noinline))
do_wait (int *addr, int val)
{
unsigned long long i, count = gomp_spin_count_var;
for (i = 0; i < count; i++)
if (__builtin_expect (*addr != val, 0))
return;
else
cpu_relax ();
}
long
spins_per_msec (void)
{
struct timespec ts1, ts2, ts3, ts4;
int x;
/* If clock isn't supported or has too low resolution, fail. */
if (clock_getres (clockid, &ts1) < 0
|| ts1.tv_sec
|| ts1.tv_nsec > 50)
return -1;
/* Warm up the loop. */
gomp_spin_count_var = 10;
do_wait (&x, 0);
gomp_spin_count_var = 100;
clock_gettime (clockid, &ts1);
clock_gettime (clockid, &ts2);
do_wait (&x, 0);
clock_gettime (clockid, &ts3);
do_wait (&x, 0);
clock_gettime (clockid, &ts4);
ts4.tv_sec -= ts3.tv_sec;
ts4.tv_nsec -= ts3.tv_nsec;
if (ts4.tv_nsec < 0)
{
ts4.tv_sec--;
ts4.tv_nsec += 1000000000;
}
ts3.tv_sec -= ts2.tv_sec;
ts3.tv_nsec -= ts2.tv_nsec;
if (ts3.tv_nsec < 0)
{
ts3.tv_sec--;
ts3.tv_nsec += 1000000000;
}
ts2.tv_sec -= ts1.tv_sec;
ts2.tv_nsec -= ts1.tv_nsec;
if (ts2.tv_nsec < 0)
{
ts2.tv_sec--;
ts2.tv_nsec += 1000000000;
}
if (ts2.tv_sec || ts3.tv_sec || ts4.tv_sec)
return -1;
if (ts2.tv_nsec <= ts3.tv_nsec && ts2.tv_nsec <= ts4.tv_nsec)
{
ts3.tv_nsec -= ts2.tv_nsec;
ts4.tv_nsec -= ts2.tv_nsec;
}
return 200000000 / (ts3.tv_nsec + ts4.tv_nsec);
}
int
main (void)
{
long spms = spins_per_msec ();
printf ("%ld\n", spms);
gomp_spin_count_var = spms;
struct timespec ts1, ts2;
clock_gettime (clockid, &ts1);
int x = 0;
do_wait (&x, 0);
clock_gettime (clockid, &ts2);
ts2.tv_sec -= ts1.tv_sec;
ts2.tv_nsec -= ts1.tv_nsec;
if (ts2.tv_nsec < 0)
{
ts2.tv_sec--;
ts2.tv_nsec += 1000000000;
}
/* This should ideally print something around 0 1000000 */
printf ("%ld %ld\n", (long) ts2.tv_sec, (long) ts2.tv_nsec);
return 0;
}
^ permalink raw reply [flat|nested] 2+ messages in thread
* RE: [gomp3] Remove GOMP_BLOCKTIME env var, add GOMP_SPINCOUNT and handle OMP_WAIT_POLICY
2008-03-19 13:47 [gomp3] Remove GOMP_BLOCKTIME env var, add GOMP_SPINCOUNT and handle OMP_WAIT_POLICY Jakub Jelinek
@ 2008-03-19 23:40 ` Liaskovitis, Vasileios
0 siblings, 0 replies; 2+ messages in thread
From: Liaskovitis, Vasileios @ 2008-03-19 23:40 UTC (permalink / raw)
To: Jakub Jelinek, gcc-patches
Hi,
I've measured the performance improvements of the gomp3 patches up
until Tuesday (revision 133292, after the schedule speedups patch, i.e.
still using GOMP_BLOCKTIME) running EPCC (syncbench subtest) on a 4-node
AMD x86-64 quad-core (16 cores total).
The C-version syncbench test from the EPCC microbenchmarks
(http://www2.epcc.ed.ac.uk/computing/research_activities/openmpbench/download.html)
was compiled with -O2 -fopenmp using gcc4.3.0
4.3.0 is gcc4.3.0 release libgomp
gomp3-futex is gomp3 rev133292 with GOMP_BLOCKTIME=0
gomp3-block is gomp3 rev133292 with GOMP_BLOCKTIME=infinity
All numbers are percentages of 4.3.0 (release) libgomp overhead times
for the specified omp pragmas. Results are averaged across 5 runs.
BARRIER 2threads 4threads 8threads 16threads
4.3.0 100.00% 100.00% 100.00% 100.00%
gomp3-futex 74.33% 62.77% 49.60% 99.62%
gomp3-block 8.39% 6.72% 4.77% 3.34%
FOR 2threads 4threads 8threads
16threads
4.3.0 100.00% 100.00% 100.00% 100.00%
gomp3-futex 85.81% 62.76% 49.27% 99.48%
gomp3-block 8.41% 6.69% 4.80% 3.49%
PARALLEL 2threads 4threads 8threads
16threads
4.3.0 100.00% 100.00% 100.00% 100.00%
gomp3-futex 106.64% 66.15% 64.69% 92.23%
gomp3-block 17.75% 9.69% 8.82% 8.77%
SINGLE 2threads 4threads 8threads 16threads
4.3.0 100.00% 100.00% 100.00% 100.00%
gomp3-futex 69.32% 50.00% 56.04% 91.23%
gomp3-block 36.40% 34.61% 35.06% 60.34%
REDUCTION 2threads 4threads 8threads
16threads
4.3.0 100.00% 100.00% 100.00% 100.00%
gomp3-futex 106.01% 65.00% 65.47% 93.05%
gomp3-block 18.04% 12.17% 12.46% 11.76%
Using busywait instead of futex calls provides good improvements and
scaling.
Gomp3-futex is also faster than 4.3.0 in some cases. What other
improvements cause this? Does eliminating the malloc calls (in the
parallel speedup patch) account for most of the difference?
Thanks,
- Vasilis
-----Original Message-----
From: gcc-patches-owner@gcc.gnu.org
[mailto:gcc-patches-owner@gcc.gnu.org] On Behalf Of Jakub Jelinek
Sent: Wednesday, March 19, 2008 8:05 AM
To: gcc-patches@gcc.gnu.org
Cc: Ulrich Drepper
Subject: [gomp3] Remove GOMP_BLOCKTIME env var, add GOMP_SPINCOUNT and
handle OMP_WAIT_POLICY
Hi!
I've been playing with runtime detection of the number of
do_wait spins per millisecond (see attached proglet), unfortunately
on some architectures (e.g. ppc G5) it is completely unreliable, on
others,
including x86_64, it is from time to time 10 times off. But estimating
the spin count so that it is at most 10 times off at least on most CPUs
doesn't need any runtime detection, especially given that CPU
frequencies
hit the ceiling.
So, instead of letting users specify GOMP_BLOCKTIME as time in
milliseconds
to spin, this patch lets users specify GOMP_SPINCOUNT as number of
spins.
If this isn't specified, the default is 30g for OMP_WAIT_POLICY=active
(30g is roughly 5 minutes, could be 2 or 15 minutes depending on hw)
20m for no OMP_WAIT_POLICY (roughly 0.2 seconds) and 0 for
OMP_WAIT_POLICY=passive. Additionally, the patch tracks number of
threads
currently managed by libgomp and if there are more libgomp managed
threads
than available CPUs, the spin counts decrease radically (1k for
OMP_WAIT_POLICY=active, 100 for no OMP_WAIT_POLICY).
The busy waiting duration is a hint anyway, so being not very precise is
IMHO not a big deal, but e.g. doing clock_gettime every few iterations
would
increase a latency a lot.
2008-03-19 Jakub Jelinek <jakub@redhat.com>
* libgomp.h (gomp_active_wait_policy): Remove decl.
(gomp_throttled_spin_count_var, gomp_available_cpus,
gomp_managed_threads): New extern decls.
* team.c (gomp_team_start, gomp_team_end): If number of threads
changed, adjust atomically gomp_managed_threads.
* env.c (gomp_active_wait_policy, gomp_block_time_var): Remove.
(gomp_throttled_spin_count_var, gomp_available_cpus,
gomp_managed_threads): New variables.
(parse_millis): Removed.
(parse_spincount): New function.
(parse_wait_policy): Return -1/0/1 instead of setting
gomp_active_wait_policy.
(initialize_env): Call gomp_init_num_threads unconditionally.
Initialize gomp_available_cpus. Call parse_spincount instead
of parse_millis, initialize gomp_{,throttled_}spin_count_var
depending on presence and value of OMP_WAIT_POLICY and
GOMP_SPINCOUNT env vars.
* config/linux/wait.h (do_wait): Use
gomp_throttled_spin_count_var
instead of gomp_spin_count_var if gomp_managed_threads >
gomp_available_cpus.
--- libgomp/team.c (revision 133292)
+++ libgomp/team.c (working copy)
@@ -287,8 +287,24 @@ gomp_team_start (void (*fn) (void *), vo
}
}
+ if (__builtin_expect (nthreads > old_threads_used, 0))
+ {
+ long diff = (long) nthreads - (long) old_threads_used;
+
+ if (old_threads_used == 0)
+ --diff;
+
+#ifdef HAVE_SYNC_BUILTINS
+ __sync_fetch_and_add (&gomp_managed_threads, diff);
+#else
+ gomp_mutex_lock (&gomp_remaining_threads_lock);
+ gomp_managed_threads += diff;
+ gomp_mutex_unlock (&gomp_remaining_threads_lock);
+#endif
+ }
+
attr = &gomp_thread_attr;
- if (gomp_cpu_affinity != NULL)
+ if (__builtin_expect (gomp_cpu_affinity != NULL, 0))
{
size_t stacksize;
pthread_attr_init (&thread_attr);
@@ -328,7 +344,7 @@ gomp_team_start (void (*fn) (void *), vo
gomp_fatal ("Thread creation failed: %s", strerror (err));
}
- if (gomp_cpu_affinity != NULL)
+ if (__builtin_expect (gomp_cpu_affinity != NULL, 0))
pthread_attr_destroy (&thread_attr);
do_release:
@@ -338,8 +354,20 @@ gomp_team_start (void (*fn) (void *), vo
that should arrive back at the end of this team. The extra
threads should be exiting. Note that we arrange for this test
to never be true for nested teams. */
- if (nthreads < old_threads_used)
- gomp_barrier_reinit (&gomp_threads_dock, nthreads);
+ if (__builtin_expect (nthreads < old_threads_used, 0))
+ {
+ long diff = (long) nthreads - (long) old_threads_used;
+
+ gomp_barrier_reinit (&gomp_threads_dock, nthreads);
+
+#ifdef HAVE_SYNC_BUILTINS
+ __sync_fetch_and_add (&gomp_managed_threads, diff);
+#else
+ gomp_mutex_lock (&gomp_remaining_threads_lock);
+ gomp_managed_threads += diff;
+ gomp_mutex_unlock (&gomp_remaining_threads_lock);
+#endif
+ }
}
@@ -357,6 +385,17 @@ gomp_team_end (void)
gomp_end_task ();
thr->ts = team->prev_ts;
+ if (__builtin_expect (thr->ts.team != NULL, 0))
+ {
+#ifdef HAVE_SYNC_BUILTINS
+ __sync_fetch_and_add (&gomp_managed_threads, 1L -
team->nthreads);
+#else
+ gomp_mutex_lock (&gomp_remaining_threads_lock);
+ gomp_managed_threads -= team->nthreads - 1L;
+ gomp_mutex_unlock (&gomp_remaining_threads_lock);
+#endif
+ }
+
free_team (team);
}
--- libgomp/env.c (revision 133291)
+++ libgomp/env.c (working copy)
@@ -57,7 +57,6 @@ struct gomp_task_icv gomp_global_icv = {
};
unsigned short *gomp_cpu_affinity;
-bool gomp_active_wait_policy = false;
size_t gomp_cpu_affinity_len;
unsigned long gomp_max_active_levels_var = INT_MAX;
unsigned long gomp_thread_limit_var = ULONG_MAX;
@@ -65,8 +64,8 @@ unsigned long gomp_remaining_threads_cou
#ifndef HAVE_SYNC_BUILTINS
gomp_mutex_t gomp_remaining_threads_lock;
#endif
-static unsigned long gomp_block_time_var;
-unsigned long long gomp_spin_count_var;
+unsigned long gomp_available_cpus = 1, gomp_managed_threads = 1;
+unsigned long long gomp_spin_count_var, gomp_throttled_spin_count_var;
/* Parse the OMP_SCHEDULE environment variable. */
@@ -239,14 +238,14 @@ parse_stacksize (const char *name, unsig
return false;
}
-/* Parse the GOMP_BLOCKTIME environment varible. Return true if one
was
+/* Parse the GOMP_SPINCOUNT environment varible. Return true if one
was
present and it was successfully parsed. */
static bool
-parse_millis (const char *name, unsigned long *pvalue)
+parse_spincount (const char *name, unsigned long long *pvalue)
{
char *env, *end;
- unsigned long value, mult = 1;
+ unsigned long long value, mult = 1;
env = getenv (name);
if (env == NULL)
@@ -257,17 +256,16 @@ parse_millis (const char *name, unsigned
if (*env == '\0')
goto invalid;
- if (strncasecmp (env, "infinite", 8) != 0
- || strncasecmp (env, "infinity", 8) != 0
- || strncasecmp (env, "unexpire", 8) != 0)
+ if (strncasecmp (env, "infinite", 8) == 0
+ || strncasecmp (env, "infinity", 8) == 0)
{
- value = ULONG_MAX;
+ value = ~0ULL;
end = env + 8;
goto check_tail;
}
errno = 0;
- value = strtoul (env, &end, 10);
+ value = strtoull (env, &end, 10);
if (errno)
goto invalid;
@@ -277,17 +275,17 @@ parse_millis (const char *name, unsigned
{
switch (tolower (*end))
{
- case 's':
- mult = 1000;
+ case 'k':
+ mult = 1000LL;
break;
case 'm':
- mult = 60 * 1000;
+ mult = 1000LL * 1000LL;
break;
- case 'h':
- mult = 60 * 60 * 1000;
+ case 'g':
+ mult = 1000LL * 1000LL * 1000LL;
break;
- case 'd':
- mult = 24 * 60 * 60 * 1000;
+ case 't':
+ mult = 1000LL * 1000LL * 1000LL * 1000LL;
break;
default:
goto invalid;
@@ -300,8 +298,8 @@ parse_millis (const char *name, unsigned
goto invalid;
}
- if (value > ULONG_MAX / mult)
- value = ULONG_MAX;
+ if (value > ~0ULL / mult)
+ value = ~0ULL;
else
value *= mult;
@@ -348,33 +346,36 @@ parse_boolean (const char *name, bool *v
/* Parse the OMP_WAIT_POLICY environment variable and store the
result in gomp_active_wait_policy. */
-static void
+static int
parse_wait_policy (void)
{
const char *env;
+ int ret = -1;
env = getenv ("OMP_WAIT_POLICY");
if (env == NULL)
- return;
+ return -1;
while (isspace ((unsigned char) *env))
++env;
if (strncasecmp (env, "active", 6) == 0)
{
- gomp_active_wait_policy = true;
+ ret = 1;
env += 6;
}
else if (strncasecmp (env, "passive", 7) == 0)
{
- gomp_active_wait_policy = false;
+ ret = 0;
env += 7;
}
else
env = "X";
while (isspace ((unsigned char) *env))
++env;
- if (*env != '\0')
- gomp_error ("Invalid value for environment variable
OMP_WAIT_POLICY");
+ if (*env == '\0')
+ return ret;
+ gomp_error ("Invalid value for environment variable
OMP_WAIT_POLICY");
+ return -1;
}
/* Parse the GOMP_CPU_AFFINITY environment varible. Return true if one
was
@@ -472,6 +473,7 @@ static void __attribute__((constructor))
initialize_env (void)
{
unsigned long stacksize;
+ int wait_policy;
/* Do a compile time check that mkomp_h.pl did good job. */
omp_check_defines ();
@@ -479,7 +481,6 @@ initialize_env (void)
parse_schedule ();
parse_boolean ("OMP_DYNAMIC", &gomp_global_icv.dyn_var);
parse_boolean ("OMP_NESTED", &gomp_global_icv.nest_var);
- parse_wait_policy ();
parse_unsigned_long ("OMP_MAX_ACTIVE_LEVELS",
&gomp_max_active_levels_var);
parse_unsigned_long ("OMP_THREAD_LIMIT", &gomp_thread_limit_var);
if (gomp_thread_limit_var != ULONG_MAX)
@@ -489,23 +490,34 @@ initialize_env (void)
gomp_mutex_init (&gomp_remaining_threads_lock);
#endif
}
+ gomp_init_num_threads ();
+ gomp_available_cpus = gomp_global_icv.nthreads_var;
if (!parse_unsigned_long ("OMP_NUM_THREADS",
&gomp_global_icv.nthreads_var))
- gomp_init_num_threads ();
+ gomp_global_icv.nthreads_var = gomp_available_cpus;
if (parse_affinity ())
gomp_init_affinity ();
- if (!parse_millis ("GOMP_BLOCKTIME", &gomp_block_time_var))
- {
- if (gomp_active_wait_policy)
- gomp_block_time_var = 200; /* 200ms */
- }
- if (gomp_block_time_var > 0)
+ wait_policy = parse_wait_policy ();
+ if (!parse_spincount ("GOMP_SPINCOUNT", &gomp_spin_count_var))
{
- if (gomp_block_time_var == ULONG_MAX)
- gomp_spin_count_var = ~0ULL;
- else
- /* Estimate translation of gomp_block_time_var in milliseconds
to
- spin count. */;
- }
+ /* Using a rough estimation of 100000 spins per msec,
+ use 5 min blocking for OMP_WAIT_POLICY=active,
+ 200 msec blocking when OMP_WAIT_POLICY is not specificed
+ and 0 when OMP_WAIT_POLICY=passive.
+ Depending on the CPU speed, this can be e.g. 5 times longer
+ or 5 times shorter. */
+ if (wait_policy > 0)
+ gomp_spin_count_var = 30000000000LL;
+ else if (wait_policy < 0)
+ gomp_spin_count_var = 20000000LL;
+ }
+ /* gomp_throttled_spin_count_var is used when there are more libgomp
+ managed threads than available CPUs. Use very short spinning. */
+ if (wait_policy > 0)
+ gomp_throttled_spin_count_var = 1000LL;
+ else if (wait_policy < 0)
+ gomp_throttled_spin_count_var = 100LL;
+ if (gomp_throttled_spin_count_var > gomp_spin_count_var)
+ gomp_throttled_spin_count_var = gomp_spin_count_var;
/* Not strictly environment related, but ordering constructors is
tricky. */
pthread_attr_init (&gomp_thread_attr);
--- libgomp/libgomp.h (revision 133305)
+++ libgomp/libgomp.h (working copy)
@@ -190,8 +190,8 @@ extern unsigned long gomp_remaining_thre
extern gomp_mutex_t gomp_remaining_threads_lock;
#endif
extern unsigned long gomp_max_active_levels_var;
-extern bool gomp_active_wait_policy;
-extern unsigned long long gomp_spin_count_var;
+extern unsigned long long gomp_spin_count_var,
gomp_throttled_spin_count_var;
+extern unsigned long gomp_available_cpus, gomp_managed_threads;
/* This structure describes a "task" to be run by a thread. At present
we implement only synchronous tasks, i.e. no tasks are deferred or
--- libgomp/config/linux/wait.h (revision 133339)
+++ libgomp/config/linux/wait.h (working copy)
@@ -51,6 +51,8 @@ static inline void do_wait (int *addr, i
{
unsigned long long i, count = gomp_spin_count_var;
+ if (__builtin_expect (gomp_managed_threads > gomp_available_cpus, 0))
+ count = gomp_throttled_spin_count_var;
for (i = 0; i < count; i++)
if (__builtin_expect (*addr != val, 0))
return;
Jakub
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2008-03-19 22:31 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-03-19 13:47 [gomp3] Remove GOMP_BLOCKTIME env var, add GOMP_SPINCOUNT and handle OMP_WAIT_POLICY Jakub Jelinek
2008-03-19 23:40 ` Liaskovitis, Vasileios
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).