public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [gomp3] Remove GOMP_BLOCKTIME env var, add GOMP_SPINCOUNT and handle OMP_WAIT_POLICY
@ 2008-03-19 13:47 Jakub Jelinek
  2008-03-19 23:40 ` Liaskovitis, Vasileios
  0 siblings, 1 reply; 2+ messages in thread
From: Jakub Jelinek @ 2008-03-19 13:47 UTC (permalink / raw)
  To: gcc-patches; +Cc: Ulrich Drepper

[-- Attachment #1: Type: text/plain, Size: 11787 bytes --]

Hi!

I've been playing with runtime detection of the number of
do_wait spins per millisecond (see attached proglet), unfortunately
on some architectures (e.g. ppc G5) it is completely unreliable, on others,
including x86_64, it is from time to time 10 times off.  But estimating
the spin count so that it is at most 10 times off at least on most CPUs
doesn't need any runtime detection, especially given that CPU frequencies
hit the ceiling.
So, instead of letting users specify GOMP_BLOCKTIME as time in milliseconds
to spin, this patch lets users specify GOMP_SPINCOUNT as number of spins.
If this isn't specified, the default is 30g for OMP_WAIT_POLICY=active
(30g is roughly 5 minutes, could be 2 or 15 minutes depending on hw)
20m for no OMP_WAIT_POLICY (roughly 0.2 seconds) and 0 for
OMP_WAIT_POLICY=passive.  Additionally, the patch tracks number of threads
currently managed by libgomp and if there are more libgomp managed threads
than available CPUs, the spin counts decrease radically (1k for
OMP_WAIT_POLICY=active, 100 for no OMP_WAIT_POLICY).
The busy waiting duration is a hint anyway, so being not very precise is
IMHO not a big deal, but e.g. doing clock_gettime every few iterations would
increase latency a lot.

2008-03-19  Jakub Jelinek  <jakub@redhat.com>

	* libgomp.h (gomp_active_wait_policy): Remove decl.
	(gomp_throttled_spin_count_var, gomp_available_cpus,
	gomp_managed_threads): New extern decls.
	* team.c (gomp_team_start, gomp_team_end): If number of threads
	changed, adjust atomically gomp_managed_threads.
	* env.c (gomp_active_wait_policy, gomp_block_time_var): Remove.
	(gomp_throttled_spin_count_var, gomp_available_cpus,
	gomp_managed_threads): New variables.
	(parse_millis): Removed.
	(parse_spincount): New function.
	(parse_wait_policy): Return -1/0/1 instead of setting
	gomp_active_wait_policy.
	(initialize_env): Call gomp_init_num_threads unconditionally.
	Initialize gomp_available_cpus.  Call parse_spincount instead
	of parse_millis, initialize gomp_{,throttled_}spin_count_var
	depending on presence and value of OMP_WAIT_POLICY and
	GOMP_SPINCOUNT env vars.
	* config/linux/wait.h (do_wait): Use gomp_throttled_spin_count_var
	instead of gomp_spin_count_var if gomp_managed_threads >
	gomp_available_cpus.

--- libgomp/team.c	(revision 133292)
+++ libgomp/team.c	(working copy)
@@ -287,8 +287,24 @@ gomp_team_start (void (*fn) (void *), vo
 	}
     }
 
+  if (__builtin_expect (nthreads > old_threads_used, 0))
+    {
+      long diff = (long) nthreads - (long) old_threads_used;
+
+      if (old_threads_used == 0)
+	--diff;
+
+#ifdef HAVE_SYNC_BUILTINS
+      __sync_fetch_and_add (&gomp_managed_threads, diff);
+#else
+      gomp_mutex_lock (&gomp_remaining_threads_lock);
+      gomp_managed_threads += diff;
+      gomp_mutex_unlock (&gomp_remaining_threads_lock);
+#endif
+    }
+
   attr = &gomp_thread_attr;
-  if (gomp_cpu_affinity != NULL)
+  if (__builtin_expect (gomp_cpu_affinity != NULL, 0))
     {
       size_t stacksize;
       pthread_attr_init (&thread_attr);
@@ -328,7 +344,7 @@ gomp_team_start (void (*fn) (void *), vo
 	gomp_fatal ("Thread creation failed: %s", strerror (err));
     }
 
-  if (gomp_cpu_affinity != NULL)
+  if (__builtin_expect (gomp_cpu_affinity != NULL, 0))
     pthread_attr_destroy (&thread_attr);
 
  do_release:
@@ -338,8 +354,20 @@ gomp_team_start (void (*fn) (void *), vo
      that should arrive back at the end of this team.  The extra
      threads should be exiting.  Note that we arrange for this test
      to never be true for nested teams.  */
-  if (nthreads < old_threads_used)
-    gomp_barrier_reinit (&gomp_threads_dock, nthreads);
+  if (__builtin_expect (nthreads < old_threads_used, 0))
+    {
+      long diff = (long) nthreads - (long) old_threads_used;
+
+      gomp_barrier_reinit (&gomp_threads_dock, nthreads);
+
+#ifdef HAVE_SYNC_BUILTINS
+      __sync_fetch_and_add (&gomp_managed_threads, diff);
+#else
+      gomp_mutex_lock (&gomp_remaining_threads_lock);
+      gomp_managed_threads += diff;
+      gomp_mutex_unlock (&gomp_remaining_threads_lock);
+#endif
+    }
 }
 
 
@@ -357,6 +385,17 @@ gomp_team_end (void)
   gomp_end_task ();
   thr->ts = team->prev_ts;
 
+  if (__builtin_expect (thr->ts.team != NULL, 0))
+    {
+#ifdef HAVE_SYNC_BUILTINS
+      __sync_fetch_and_add (&gomp_managed_threads, 1L - team->nthreads);
+#else
+      gomp_mutex_lock (&gomp_remaining_threads_lock);
+      gomp_managed_threads -= team->nthreads - 1L;
+      gomp_mutex_unlock (&gomp_remaining_threads_lock);
+#endif
+    }
+
   free_team (team);
 }
 
--- libgomp/env.c	(revision 133291)
+++ libgomp/env.c	(working copy)
@@ -57,7 +57,6 @@ struct gomp_task_icv gomp_global_icv = {
 };
 
 unsigned short *gomp_cpu_affinity;
-bool gomp_active_wait_policy = false;
 size_t gomp_cpu_affinity_len;
 unsigned long gomp_max_active_levels_var = INT_MAX;
 unsigned long gomp_thread_limit_var = ULONG_MAX;
@@ -65,8 +64,8 @@ unsigned long gomp_remaining_threads_cou
 #ifndef HAVE_SYNC_BUILTINS
 gomp_mutex_t gomp_remaining_threads_lock;
 #endif
-static unsigned long gomp_block_time_var;
-unsigned long long gomp_spin_count_var;
+unsigned long gomp_available_cpus = 1, gomp_managed_threads = 1;
+unsigned long long gomp_spin_count_var, gomp_throttled_spin_count_var;
 
 /* Parse the OMP_SCHEDULE environment variable.  */
 
@@ -239,14 +238,14 @@ parse_stacksize (const char *name, unsig
   return false;
 }
 
-/* Parse the GOMP_BLOCKTIME environment varible.  Return true if one was
+/* Parse the GOMP_SPINCOUNT environment varible.  Return true if one was
    present and it was successfully parsed.  */
 
 static bool
-parse_millis (const char *name, unsigned long *pvalue)
+parse_spincount (const char *name, unsigned long long *pvalue)
 {
   char *env, *end;
-  unsigned long value, mult = 1;
+  unsigned long long value, mult = 1;
 
   env = getenv (name);
   if (env == NULL)
@@ -257,17 +256,16 @@ parse_millis (const char *name, unsigned
   if (*env == '\0')
     goto invalid;
 
-  if (strncasecmp (env, "infinite", 8) != 0
-      || strncasecmp (env, "infinity", 8) != 0
-      || strncasecmp (env, "unexpire", 8) != 0)
+  if (strncasecmp (env, "infinite", 8) == 0
+      || strncasecmp (env, "infinity", 8) == 0)
     {
-      value = ULONG_MAX;
+      value = ~0ULL;
       end = env + 8;
       goto check_tail;
     }
 
   errno = 0;
-  value = strtoul (env, &end, 10);
+  value = strtoull (env, &end, 10);
   if (errno)
     goto invalid;
 
@@ -277,17 +275,17 @@ parse_millis (const char *name, unsigned
     {
       switch (tolower (*end))
 	{
-	case 's':
-	  mult = 1000;
+	case 'k':
+	  mult = 1000LL;
 	  break;
 	case 'm':
-	  mult = 60 * 1000;
+	  mult = 1000LL * 1000LL;
 	  break;
-	case 'h':
-	  mult = 60 * 60 * 1000;
+	case 'g':
+	  mult = 1000LL * 1000LL * 1000LL;
 	  break;
-	case 'd':
-	  mult = 24 * 60 * 60 * 1000;
+	case 't':
+	  mult = 1000LL * 1000LL * 1000LL * 1000LL;
 	  break;
 	default:
 	  goto invalid;
@@ -300,8 +298,8 @@ parse_millis (const char *name, unsigned
 	goto invalid;
     }
 
-  if (value > ULONG_MAX / mult)
-    value = ULONG_MAX;
+  if (value > ~0ULL / mult)
+    value = ~0ULL;
   else
     value *= mult;
 
@@ -348,33 +346,36 @@ parse_boolean (const char *name, bool *v
 /* Parse the OMP_WAIT_POLICY environment variable and store the
    result in gomp_active_wait_policy.  */
 
-static void
+static int
 parse_wait_policy (void)
 {
   const char *env;
+  int ret = -1;
 
   env = getenv ("OMP_WAIT_POLICY");
   if (env == NULL)
-    return;
+    return -1;
 
   while (isspace ((unsigned char) *env))
     ++env;
   if (strncasecmp (env, "active", 6) == 0)
     {
-      gomp_active_wait_policy = true;
+      ret = 1;
       env += 6;
     }
   else if (strncasecmp (env, "passive", 7) == 0)
     {
-      gomp_active_wait_policy = false;
+      ret = 0;
       env += 7;
     }
   else
     env = "X";
   while (isspace ((unsigned char) *env))
     ++env;
-  if (*env != '\0')
-    gomp_error ("Invalid value for environment variable OMP_WAIT_POLICY");
+  if (*env == '\0')
+    return ret;
+  gomp_error ("Invalid value for environment variable OMP_WAIT_POLICY");
+  return -1;
 }
 
 /* Parse the GOMP_CPU_AFFINITY environment varible.  Return true if one was
@@ -472,6 +473,7 @@ static void __attribute__((constructor))
 initialize_env (void)
 {
   unsigned long stacksize;
+  int wait_policy;
 
   /* Do a compile time check that mkomp_h.pl did good job.  */
   omp_check_defines ();
@@ -479,7 +481,6 @@ initialize_env (void)
   parse_schedule ();
   parse_boolean ("OMP_DYNAMIC", &gomp_global_icv.dyn_var);
   parse_boolean ("OMP_NESTED", &gomp_global_icv.nest_var);
-  parse_wait_policy ();
   parse_unsigned_long ("OMP_MAX_ACTIVE_LEVELS", &gomp_max_active_levels_var);
   parse_unsigned_long ("OMP_THREAD_LIMIT", &gomp_thread_limit_var);
   if (gomp_thread_limit_var != ULONG_MAX)
@@ -489,23 +490,34 @@ initialize_env (void)
       gomp_mutex_init (&gomp_remaining_threads_lock);
 #endif
     }
+  gomp_init_num_threads ();
+  gomp_available_cpus = gomp_global_icv.nthreads_var;
   if (!parse_unsigned_long ("OMP_NUM_THREADS", &gomp_global_icv.nthreads_var))
-    gomp_init_num_threads ();
+    gomp_global_icv.nthreads_var = gomp_available_cpus;
   if (parse_affinity ())
     gomp_init_affinity ();
-  if (!parse_millis ("GOMP_BLOCKTIME", &gomp_block_time_var))
-    {
-      if (gomp_active_wait_policy)
-	gomp_block_time_var = 200; /* 200ms */
-    }
-  if (gomp_block_time_var > 0)
+  wait_policy = parse_wait_policy ();
+  if (!parse_spincount ("GOMP_SPINCOUNT", &gomp_spin_count_var))
     {
-      if (gomp_block_time_var == ULONG_MAX)
-	gomp_spin_count_var = ~0ULL;
-      else
-	/* Estimate translation of gomp_block_time_var in milliseconds to
-	   spin count.  */;
-    }
+      /* Using a rough estimation of 100000 spins per msec,
+	 use 5 min blocking for OMP_WAIT_POLICY=active,
+	 200 msec blocking when OMP_WAIT_POLICY is not specificed
+	 and 0 when OMP_WAIT_POLICY=passive.
+	 Depending on the CPU speed, this can be e.g. 5 times longer
+	 or 5 times shorter.  */
+      if (wait_policy > 0)
+	gomp_spin_count_var = 30000000000LL;
+      else if (wait_policy < 0)
+	gomp_spin_count_var = 20000000LL;
+    }
+  /* gomp_throttled_spin_count_var is used when there are more libgomp
+     managed threads than available CPUs.  Use very short spinning.  */
+  if (wait_policy > 0)
+    gomp_throttled_spin_count_var = 1000LL;
+  else if (wait_policy < 0)
+    gomp_throttled_spin_count_var = 100LL;
+  if (gomp_throttled_spin_count_var > gomp_spin_count_var)
+    gomp_throttled_spin_count_var = gomp_spin_count_var;
 
   /* Not strictly environment related, but ordering constructors is tricky.  */
   pthread_attr_init (&gomp_thread_attr);
--- libgomp/libgomp.h	(revision 133305)
+++ libgomp/libgomp.h	(working copy)
@@ -190,8 +190,8 @@ extern unsigned long gomp_remaining_thre
 extern gomp_mutex_t gomp_remaining_threads_lock;
 #endif
 extern unsigned long gomp_max_active_levels_var;
-extern bool gomp_active_wait_policy;
-extern unsigned long long gomp_spin_count_var;
+extern unsigned long long gomp_spin_count_var, gomp_throttled_spin_count_var;
+extern unsigned long gomp_available_cpus, gomp_managed_threads;
 
 /* This structure describes a "task" to be run by a thread.  At present
    we implement only synchronous tasks, i.e. no tasks are deferred or
--- libgomp/config/linux/wait.h	(revision 133339)
+++ libgomp/config/linux/wait.h	(working copy)
@@ -51,6 +51,8 @@ static inline void do_wait (int *addr, i
 {
   unsigned long long i, count = gomp_spin_count_var;
 
+  if (__builtin_expect (gomp_managed_threads > gomp_available_cpus, 0))
+    count = gomp_throttled_spin_count_var;
   for (i = 0; i < count; i++)
     if (__builtin_expect (*addr != val, 0))
       return;

	Jakub

[-- Attachment #2: spins_per_msec.c --]
[-- Type: text/plain, Size: 2554 bytes --]

#include <time.h>
#include <stdio.h>

/* Don't use CLOCK_THREAD_CPUTIME_ID with LinuxThreads.  */
#if defined __GLIBC__ && (__GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 5))
# define clockid CLOCK_THREAD_CPUTIME_ID
#else
# define clockid CLOCK_PROCESS_CPUTIME_ID
#endif

/* Pause-instruction hint for busy-wait loops: lets the CPU save power
   and free pipeline resources while spinning.  The "memory" clobber
   doubles as a compiler barrier so the spun-on location is re-read
   each iteration.  */
static inline void
cpu_relax (void)
{
#if defined __i386__ || defined __x86_64__
  __asm volatile ("rep; nop" : : : "memory");
#elif defined __ia64__
  __asm volatile ("hint @pause" : : : "memory");
#else
  __asm volatile ("" : : : "memory");
#endif
}

/* Number of busy-wait iterations performed by do_wait.  */
unsigned long long gomp_spin_count_var;

/* Model of libgomp's do_wait spin loop: poll *ADDR for up to
   gomp_spin_count_var iterations, returning as soon as it no longer
   equals VAL.  noinline keeps the loop measurable as a unit.  */
static void
__attribute__((noinline))
do_wait (int *addr, int val)
{
  unsigned long long remaining = gomp_spin_count_var;

  while (remaining-- > 0)
    {
      if (__builtin_expect (*addr != val, 0))
	break;
      cpu_relax ();
    }
}

/* Calibrate how many do_wait spins fit in one millisecond.
   Returns the estimated spins-per-msec, or -1 when the thread/process
   CPU clock is unusable or the measurement is degenerate.  */
long
spins_per_msec (void)
{
  struct timespec ts1, ts2, ts3, ts4;
  /* Must be 0 so that do_wait (&x, 0) never observes *addr != val and
     therefore always spins the full gomp_spin_count_var iterations.
     Leaving x uninitialized was undefined behavior and, whenever the
     stack garbage happened to be nonzero, made every timed loop exit
     on its first iteration, ruining the calibration.  */
  int x = 0;

  /* If clock isn't supported or has too low resolution, fail.  */
  if (clock_getres (clockid, &ts1) < 0
      || ts1.tv_sec
      || ts1.tv_nsec > 50)
    return -1;

  /* Warm up the loop.  */
  gomp_spin_count_var = 10;
  do_wait (&x, 0);
  gomp_spin_count_var = 100;

  /* ts2-ts1 estimates the clock_gettime call overhead; ts3-ts2 and
     ts4-ts3 each time 100 spins (plus that overhead).  */
  clock_gettime (clockid, &ts1);
  clock_gettime (clockid, &ts2);
  do_wait (&x, 0);
  clock_gettime (clockid, &ts3);
  do_wait (&x, 0);
  clock_gettime (clockid, &ts4);
  ts4.tv_sec -= ts3.tv_sec;
  ts4.tv_nsec -= ts3.tv_nsec;
  if (ts4.tv_nsec < 0)
    {
      ts4.tv_sec--;
      ts4.tv_nsec += 1000000000;
    }
  ts3.tv_sec -= ts2.tv_sec;
  ts3.tv_nsec -= ts2.tv_nsec;
  if (ts3.tv_nsec < 0)
    {
      ts3.tv_sec--;
      ts3.tv_nsec += 1000000000;
    }
  ts2.tv_sec -= ts1.tv_sec;
  ts2.tv_nsec -= ts1.tv_nsec;
  if (ts2.tv_nsec < 0)
    {
      ts2.tv_sec--;
      ts2.tv_nsec += 1000000000;
    }
  /* 100 spins taking a full second or more means something went wrong.  */
  if (ts2.tv_sec || ts3.tv_sec || ts4.tv_sec)
    return -1;

  /* Subtract the clock_gettime overhead from both measurements, but
     only when it is provably the smallest of the three intervals.  */
  if (ts2.tv_nsec <= ts3.tv_nsec && ts2.tv_nsec <= ts4.tv_nsec)
    {
      ts3.tv_nsec -= ts2.tv_nsec;
      ts4.tv_nsec -= ts2.tv_nsec;
    }

  /* A zero total would make the division below undefined behavior;
     treat an unmeasurably fast loop as a calibration failure.  */
  if (ts3.tv_nsec + ts4.tv_nsec == 0)
    return -1;

  /* 2 * 100 spins took (ts3 + ts4) ns; scale up to spins per msec:
     200 spins / total_ns * 1e6 ns/msec == 200000000 / total_ns.  */
  return 200000000 / (ts3.tv_nsec + ts4.tv_nsec);
}

/* Driver: calibrate spins per msec, then verify by timing a 1 msec
   worth of spins and printing the elapsed time.  */
int
main (void)
{
  long spms = spins_per_msec ();
  printf ("%ld\n", spms);
  if (spms < 0)
    {
      /* Calibration failed.  Assigning -1 to the unsigned long long
	 spin count below would have requested ~2^64 spins, i.e. a
	 practically infinite busy wait, so bail out instead.  */
      return 1;
    }
  gomp_spin_count_var = spms;
  struct timespec ts1, ts2;
  clock_gettime (clockid, &ts1);
  int x = 0;
  do_wait (&x, 0);
  clock_gettime (clockid, &ts2);
  ts2.tv_sec -= ts1.tv_sec;
  ts2.tv_nsec -= ts1.tv_nsec;
  if (ts2.tv_nsec < 0)
    {
      ts2.tv_sec--;
      ts2.tv_nsec += 1000000000;
    }
  /* This should ideally print something around 0 1000000 */
  printf ("%ld %ld\n", (long) ts2.tv_sec, (long) ts2.tv_nsec);
  return 0;
}

^ permalink raw reply	[flat|nested] 2+ messages in thread

* RE: [gomp3] Remove GOMP_BLOCKTIME env var, add GOMP_SPINCOUNT and handle OMP_WAIT_POLICY
  2008-03-19 13:47 [gomp3] Remove GOMP_BLOCKTIME env var, add GOMP_SPINCOUNT and handle OMP_WAIT_POLICY Jakub Jelinek
@ 2008-03-19 23:40 ` Liaskovitis, Vasileios
  0 siblings, 0 replies; 2+ messages in thread
From: Liaskovitis, Vasileios @ 2008-03-19 23:40 UTC (permalink / raw)
  To: Jakub Jelinek, gcc-patches

Hi,

I 've measured the performance improvements of the gomp3 patches up
until Tuesday (revision 133292, after schedule speedups patch, i.e.
still using GOMP_BLOCKTIME) running EPCC(syncbench subtest) on a 4-node
AMD x86-64 quad-core (16 cores total).

The c-version syncbench test from the EPCC microbenchmarks
(http://www2.epcc.ed.ac.uk/computing/research_activities/openmpbench/dow
nload.html) was compiled with -O2 -fopenmp using gcc4.3.0

4.3.0 is gcc4.3.0 release libgomp
gomp3-futex is gomp3 rev133292 with GOMP_BLOCKTIME=0 
gomp3-block is gomp3 rev133292 with GOMP_BLOCKTIME=infinity

All numbers are percentages of 4.3.0 (release) libgomp overhead times
for the specified omp pragmas. Results are averaged across 5 runs.

BARRIER	2threads	4threads	8threads	16threads
4.3.0		100.00%	100.00%	100.00%	100.00%
gomp3-futex	74.33%	62.77%	49.60%	99.62%
gomp3-block	8.39%		6.72%		4.77%		3.34%
				
FOR		2threads	4threads	8threads
16threads
4.3.0		100.00%	100.00%	100.00%	100.00%
gomp3-futex	85.81%	62.76%	49.27%	99.48%
gomp3-block	8.41%		6.69%		4.80%		3.49%
				
PARALLEL	2threads	4threads	8threads
16threads
4.3.0		100.00%	100.00%	100.00%	100.00%
gomp3-futex	106.64%	66.15%	64.69%	92.23%
gomp3-block	17.75%	9.69%		8.82%		8.77%
				
SINGLE	2threads	4threads	8threads	16threads
4.3.0		100.00%	100.00%	100.00%	100.00%
gomp3-futex	69.32%	50.00%	56.04%	91.23%
gomp3-block	36.40%	34.61%	35.06%	60.34%
				
REDUCTION	2threads	4threads	8threads
16threads
4.3.0		100.00%	100.00%	100.00%	100.00%
gomp3-futex	106.01%	65.00%	65.47%	93.05%
gomp3-block	18.04%	12.17%	12.46%	11.76%

Using busywait instead of futex calls provides good improvements and
scaling.

Gomp3-futex is also faster than 4.3.0 in some cases. What other
improvements cause this? Does eliminating the malloc calls (in the
parallel speedup patch) account for most of the difference?

Thanks,

- Vasilis

-----Original Message-----
From: gcc-patches-owner@gcc.gnu.org
[mailto:gcc-patches-owner@gcc.gnu.org] On Behalf Of Jakub Jelinek
Sent: Wednesday, March 19, 2008 8:05 AM
To: gcc-patches@gcc.gnu.org
Cc: Ulrich Drepper
Subject: [gomp3] Remove GOMP_BLOCKTIME env var, add GOMP_SPINCOUNT and
handle OMP_WAIT_POLICY

Hi!

I've been playing with runtime detection of the number of
do_wait spins per millisecond (see attached proglet), unfortunately
on some architectures (e.g. ppc G5) it is completely unreliable, on
others,
including x86_64, it is from time to time 10 times off.  But estimating
the spin count so that it is at most 10 times off at least on most CPUs
doesn't need any runtime detection, especially given that CPU
frequencies
hit the ceiling.
So, instead of letting users specify GOMP_BLOCKCOUNT as time in
milliseconds
to spin, this patch lets users specify GOMP_SPINCOUNT as number of
spins.
If this isn't specified, the default is 30g for OMP_WAIT_POLICY=active
(30g is roughly 5 minutes, could be 2 or 15 minutes depending on hw)
20m for no OMP_WAIT_POLICY (roughly 0.2 seconds) and 0 for
OMP_WAIT_POLICY=passive.  Additionally, the patch tracks number of
threads
currently managed by libgomp and if there are more libgomp managed
threads
than available CPUs, the spin counts decrease radically (1k for
OMP_WAIT_POLICY=active, 100 for no OMP_WAIT_POLICY).
The busy waiting duration is a hint anyway, so being not very precise is
IMHO not a big deal, but e.g. doing clock_gettime every few iterations
would
increase a latency a lot.

2008-03-19  Jakub Jelinek  <jakub@redhat.com>

	* libgomp.h (gomp_active_wait_policy): Remove decl.
	(gomp_throttled_spin_count_var, gomp_available_cpus,
	gomp_managed_threads): New extern decls.
	* team.c (gomp_team_start, gomp_team_end): If number of threads
	changed, adjust atomically gomp_managed_threads.
	* env.c (gomp_active_wait_policy, gomp_block_time_var): Remove.
	(gomp_throttled_spin_count_var, gomp_available_cpus,
	gomp_managed_threads): New variables.
	(parse_millis): Removed.
	(parse_spincount): New function.
	(parse_wait_policy): Return -1/0/1 instead of setting
	gomp_active_wait_policy.
	(initialize_env): Call gomp_init_num_threads unconditionally.
	Initialize gomp_available_cpus.  Call parse_spincount instead
	of parse_millis, initialize gomp_{,throttled_}spin_count_var
	depending on presence and value of OMP_WAIT_POLICY and
	GOMP_SPINCOUNT env vars.
	* config/linux/wait.h (do_wait): Use
gomp_throttled_spin_count_var
	instead of gomp_spin_count_var if gomp_managed_threads >
	gomp_available_cpus.

--- libgomp/team.c	(revision 133292)
+++ libgomp/team.c	(working copy)
@@ -287,8 +287,24 @@ gomp_team_start (void (*fn) (void *), vo
 	}
     }
 
+  if (__builtin_expect (nthreads > old_threads_used, 0))
+    {
+      long diff = (long) nthreads - (long) old_threads_used;
+
+      if (old_threads_used == 0)
+	--diff;
+
+#ifdef HAVE_SYNC_BUILTINS
+      __sync_fetch_and_add (&gomp_managed_threads, diff);
+#else
+      gomp_mutex_lock (&gomp_remaining_threads_lock);
+      gomp_managed_threads += diff;
+      gomp_mutex_unlock (&gomp_remaining_threads_lock);
+#endif
+    }
+
   attr = &gomp_thread_attr;
-  if (gomp_cpu_affinity != NULL)
+  if (__builtin_expect (gomp_cpu_affinity != NULL, 0))
     {
       size_t stacksize;
       pthread_attr_init (&thread_attr);
@@ -328,7 +344,7 @@ gomp_team_start (void (*fn) (void *), vo
 	gomp_fatal ("Thread creation failed: %s", strerror (err));
     }
 
-  if (gomp_cpu_affinity != NULL)
+  if (__builtin_expect (gomp_cpu_affinity != NULL, 0))
     pthread_attr_destroy (&thread_attr);
 
  do_release:
@@ -338,8 +354,20 @@ gomp_team_start (void (*fn) (void *), vo
      that should arrive back at the end of this team.  The extra
      threads should be exiting.  Note that we arrange for this test
      to never be true for nested teams.  */
-  if (nthreads < old_threads_used)
-    gomp_barrier_reinit (&gomp_threads_dock, nthreads);
+  if (__builtin_expect (nthreads < old_threads_used, 0))
+    {
+      long diff = (long) nthreads - (long) old_threads_used;
+
+      gomp_barrier_reinit (&gomp_threads_dock, nthreads);
+
+#ifdef HAVE_SYNC_BUILTINS
+      __sync_fetch_and_add (&gomp_managed_threads, diff);
+#else
+      gomp_mutex_lock (&gomp_remaining_threads_lock);
+      gomp_managed_threads += diff;
+      gomp_mutex_unlock (&gomp_remaining_threads_lock);
+#endif
+    }
 }
 
 
@@ -357,6 +385,17 @@ gomp_team_end (void)
   gomp_end_task ();
   thr->ts = team->prev_ts;
 
+  if (__builtin_expect (thr->ts.team != NULL, 0))
+    {
+#ifdef HAVE_SYNC_BUILTINS
+      __sync_fetch_and_add (&gomp_managed_threads, 1L -
team->nthreads);
+#else
+      gomp_mutex_lock (&gomp_remaining_threads_lock);
+      gomp_managed_threads -= team->nthreads - 1L;
+      gomp_mutex_unlock (&gomp_remaining_threads_lock);
+#endif
+    }
+
   free_team (team);
 }
 
--- libgomp/env.c	(revision 133291)
+++ libgomp/env.c	(working copy)
@@ -57,7 +57,6 @@ struct gomp_task_icv gomp_global_icv = {
 };
 
 unsigned short *gomp_cpu_affinity;
-bool gomp_active_wait_policy = false;
 size_t gomp_cpu_affinity_len;
 unsigned long gomp_max_active_levels_var = INT_MAX;
 unsigned long gomp_thread_limit_var = ULONG_MAX;
@@ -65,8 +64,8 @@ unsigned long gomp_remaining_threads_cou
 #ifndef HAVE_SYNC_BUILTINS
 gomp_mutex_t gomp_remaining_threads_lock;
 #endif
-static unsigned long gomp_block_time_var;
-unsigned long long gomp_spin_count_var;
+unsigned long gomp_available_cpus = 1, gomp_managed_threads = 1;
+unsigned long long gomp_spin_count_var, gomp_throttled_spin_count_var;
 
 /* Parse the OMP_SCHEDULE environment variable.  */
 
@@ -239,14 +238,14 @@ parse_stacksize (const char *name, unsig
   return false;
 }
 
-/* Parse the GOMP_BLOCKTIME environment varible.  Return true if one
was
+/* Parse the GOMP_SPINCOUNT environment varible.  Return true if one
was
    present and it was successfully parsed.  */
 
 static bool
-parse_millis (const char *name, unsigned long *pvalue)
+parse_spincount (const char *name, unsigned long long *pvalue)
 {
   char *env, *end;
-  unsigned long value, mult = 1;
+  unsigned long long value, mult = 1;
 
   env = getenv (name);
   if (env == NULL)
@@ -257,17 +256,16 @@ parse_millis (const char *name, unsigned
   if (*env == '\0')
     goto invalid;
 
-  if (strncasecmp (env, "infinite", 8) != 0
-      || strncasecmp (env, "infinity", 8) != 0
-      || strncasecmp (env, "unexpire", 8) != 0)
+  if (strncasecmp (env, "infinite", 8) == 0
+      || strncasecmp (env, "infinity", 8) == 0)
     {
-      value = ULONG_MAX;
+      value = ~0ULL;
       end = env + 8;
       goto check_tail;
     }
 
   errno = 0;
-  value = strtoul (env, &end, 10);
+  value = strtoull (env, &end, 10);
   if (errno)
     goto invalid;
 
@@ -277,17 +275,17 @@ parse_millis (const char *name, unsigned
     {
       switch (tolower (*end))
 	{
-	case 's':
-	  mult = 1000;
+	case 'k':
+	  mult = 1000LL;
 	  break;
 	case 'm':
-	  mult = 60 * 1000;
+	  mult = 1000LL * 1000LL;
 	  break;
-	case 'h':
-	  mult = 60 * 60 * 1000;
+	case 'g':
+	  mult = 1000LL * 1000LL * 1000LL;
 	  break;
-	case 'd':
-	  mult = 24 * 60 * 60 * 1000;
+	case 't':
+	  mult = 1000LL * 1000LL * 1000LL * 1000LL;
 	  break;
 	default:
 	  goto invalid;
@@ -300,8 +298,8 @@ parse_millis (const char *name, unsigned
 	goto invalid;
     }
 
-  if (value > ULONG_MAX / mult)
-    value = ULONG_MAX;
+  if (value > ~0ULL / mult)
+    value = ~0ULL;
   else
     value *= mult;
 
@@ -348,33 +346,36 @@ parse_boolean (const char *name, bool *v
 /* Parse the OMP_WAIT_POLICY environment variable and store the
    result in gomp_active_wait_policy.  */
 
-static void
+static int
 parse_wait_policy (void)
 {
   const char *env;
+  int ret = -1;
 
   env = getenv ("OMP_WAIT_POLICY");
   if (env == NULL)
-    return;
+    return -1;
 
   while (isspace ((unsigned char) *env))
     ++env;
   if (strncasecmp (env, "active", 6) == 0)
     {
-      gomp_active_wait_policy = true;
+      ret = 1;
       env += 6;
     }
   else if (strncasecmp (env, "passive", 7) == 0)
     {
-      gomp_active_wait_policy = false;
+      ret = 0;
       env += 7;
     }
   else
     env = "X";
   while (isspace ((unsigned char) *env))
     ++env;
-  if (*env != '\0')
-    gomp_error ("Invalid value for environment variable
OMP_WAIT_POLICY");
+  if (*env == '\0')
+    return ret;
+  gomp_error ("Invalid value for environment variable
OMP_WAIT_POLICY");
+  return -1;
 }
 
 /* Parse the GOMP_CPU_AFFINITY environment varible.  Return true if one
was
@@ -472,6 +473,7 @@ static void __attribute__((constructor))
 initialize_env (void)
 {
   unsigned long stacksize;
+  int wait_policy;
 
   /* Do a compile time check that mkomp_h.pl did good job.  */
   omp_check_defines ();
@@ -479,7 +481,6 @@ initialize_env (void)
   parse_schedule ();
   parse_boolean ("OMP_DYNAMIC", &gomp_global_icv.dyn_var);
   parse_boolean ("OMP_NESTED", &gomp_global_icv.nest_var);
-  parse_wait_policy ();
   parse_unsigned_long ("OMP_MAX_ACTIVE_LEVELS",
&gomp_max_active_levels_var);
   parse_unsigned_long ("OMP_THREAD_LIMIT", &gomp_thread_limit_var);
   if (gomp_thread_limit_var != ULONG_MAX)
@@ -489,23 +490,34 @@ initialize_env (void)
       gomp_mutex_init (&gomp_remaining_threads_lock);
 #endif
     }
+  gomp_init_num_threads ();
+  gomp_available_cpus = gomp_global_icv.nthreads_var;
   if (!parse_unsigned_long ("OMP_NUM_THREADS",
&gomp_global_icv.nthreads_var))
-    gomp_init_num_threads ();
+    gomp_global_icv.nthreads_var = gomp_available_cpus;
   if (parse_affinity ())
     gomp_init_affinity ();
-  if (!parse_millis ("GOMP_BLOCKTIME", &gomp_block_time_var))
-    {
-      if (gomp_active_wait_policy)
-	gomp_block_time_var = 200; /* 200ms */
-    }
-  if (gomp_block_time_var > 0)
+  wait_policy = parse_wait_policy ();
+  if (!parse_spincount ("GOMP_SPINCOUNT", &gomp_spin_count_var))
     {
-      if (gomp_block_time_var == ULONG_MAX)
-	gomp_spin_count_var = ~0ULL;
-      else
-	/* Estimate translation of gomp_block_time_var in milliseconds
to
-	   spin count.  */;
-    }
+      /* Using a rough estimation of 100000 spins per msec,
+	 use 5 min blocking for OMP_WAIT_POLICY=active,
+	 200 msec blocking when OMP_WAIT_POLICY is not specificed
+	 and 0 when OMP_WAIT_POLICY=passive.
+	 Depending on the CPU speed, this can be e.g. 5 times longer
+	 or 5 times shorter.  */
+      if (wait_policy > 0)
+	gomp_spin_count_var = 30000000000LL;
+      else if (wait_policy < 0)
+	gomp_spin_count_var = 20000000LL;
+    }
+  /* gomp_throttled_spin_count_var is used when there are more libgomp
+     managed threads than available CPUs.  Use very short spinning.  */
+  if (wait_policy > 0)
+    gomp_throttled_spin_count_var = 1000LL;
+  else if (wait_policy < 0)
+    gomp_throttled_spin_count_var = 100LL;
+  if (gomp_throttled_spin_count_var > gomp_spin_count_var)
+    gomp_throttled_spin_count_var = gomp_spin_count_var;
 
   /* Not strictly environment related, but ordering constructors is
tricky.  */
   pthread_attr_init (&gomp_thread_attr);
--- libgomp/libgomp.h	(revision 133305)
+++ libgomp/libgomp.h	(working copy)
@@ -190,8 +190,8 @@ extern unsigned long gomp_remaining_thre
 extern gomp_mutex_t gomp_remaining_threads_lock;
 #endif
 extern unsigned long gomp_max_active_levels_var;
-extern bool gomp_active_wait_policy;
-extern unsigned long long gomp_spin_count_var;
+extern unsigned long long gomp_spin_count_var,
gomp_throttled_spin_count_var;
+extern unsigned long gomp_available_cpus, gomp_managed_threads;
 
 /* This structure describes a "task" to be run by a thread.  At present
    we implement only synchronous tasks, i.e. no tasks are deferred or
--- libgomp/config/linux/wait.h	(revision 133339)
+++ libgomp/config/linux/wait.h	(working copy)
@@ -51,6 +51,8 @@ static inline void do_wait (int *addr, i
 {
   unsigned long long i, count = gomp_spin_count_var;
 
+  if (__builtin_expect (gomp_managed_threads > gomp_available_cpus, 0))
+    count = gomp_throttled_spin_count_var;
   for (i = 0; i < count; i++)
     if (__builtin_expect (*addr != val, 0))
       return;

	Jakub

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2008-03-19 22:31 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-03-19 13:47 [gomp3] Remove GOMP_BLOCKTIME env var, add GOMP_SPINCOUNT and handle OMP_WAIT_POLICY Jakub Jelinek
2008-03-19 23:40 ` Liaskovitis, Vasileios

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).