public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [gomp3] #pragma omp parallel ; speedup
@ 2008-03-17 18:14 Jakub Jelinek
  0 siblings, 0 replies; only message in thread
From: Jakub Jelinek @ 2008-03-17 18:14 UTC (permalink / raw)
  To: gcc-patches; +Cc: Ulrich Drepper

Hi!

The following patch speeds up #pragma omp parallel when the threads are
already created.
On the microbenchmark I've posted on Saturday the speedup with
GOMP_BLOCKTIME=infinity is from roughly
barrier bench 0.99152 seconds
parallel bench 2.31152 seconds
static bench 0.115849 seconds
dynamic bench 0.555459 seconds
to
barrier bench 0.989482 seconds
parallel bench 0.694825 seconds
static bench 0.0365596 seconds
dynamic bench 0.493667 seconds
, on real-world benchmarks the speedup is less dramatic, but still
noticeable.  The speedup is mainly from doing just one malloc/free
per
per
#pragma omp parallel
;
rather than 3, where one of those (task) was especially bad because
freeing was done in a different thread from where it was allocated.
There is no reason why the implicit task needs to be freed (similarly,
for if (0) tasks the task structure can be kept on GOMP_task function's
stack, as the invoking task can't be scheduled until the if (0) task
finishes).  Most of the parallel regions have only very few workshares
concurrently active, so allocating the initial 4 slots for work_shares
inside of gomp_team allows getting rid of extra allocation/deallocation
in the common case.

Committed to gomp-3_0-branch.

2008-03-17  Jakub Jelinek  <jakub@redhat.com>

	* libgomp.h (struct gomp_team): Change ordered_release field
	into gomp_sem_t ** from flexible array member.  Add implicit_task
	and initial_work_shares fields.
	(gomp_new_task): Removed.
	(gomp_init_task): New prototype.
	* team.c (new_team): Allocate implicit_task for each thread
	and initial work_shares together with gomp_team allocation.
	(free_team): Only free work_shares if it is not init_work_shares.
	(gomp_team_start): Use gomp_init_task instead of gomp_new_task,
	set thr->task to the corresponding implicit_task array entry.
	* task.c (gomp_new_task): Removed.
	(gomp_init_task): New function.
	(gomp_end_task): Don't free the task.
	(GOMP_task): Allocate struct gomp_task on the stack, call
	gomp_init_task rather than gomp_new_task.
	* work.c (gomp_work_share_start): If work_shares ==
	init_work_shares, gomp_malloc + memcpy rather than gomp_realloc.

--- libgomp/libgomp.h	(revision 133291)
+++ libgomp/libgomp.h	(working copy)
@@ -164,6 +164,40 @@ struct gomp_team_state
   unsigned long static_trip;
 };
 
+/* These are the OpenMP 3.0 Internal Control Variables described in
+   section 2.3.1.  Those described as having one copy per task are
+   stored within the structure; those described as having one copy
+   for the whole program are (naturally) global variables.  */
+
+struct gomp_task_icv
+{
+  unsigned long nthreads_var;
+  enum gomp_schedule_type run_sched_var;
+  int run_sched_modifier;
+  bool dyn_var;
+  bool nest_var;
+};
+
+extern struct gomp_task_icv gomp_global_icv;
+extern unsigned long gomp_thread_limit_var;
+extern unsigned long gomp_remaining_threads_count;
+#ifndef HAVE_SYNC_BUILTINS
+extern gomp_mutex_t gomp_remaining_threads_lock;
+#endif
+extern unsigned long gomp_max_active_levels_var;
+extern bool gomp_active_wait_policy;
+extern unsigned long long gomp_spin_count_var;
+
+/* This structure describes a "task" to be run by a thread.  At present
+   we implement only synchronous tasks, i.e. no tasks are deferred or
+   untied.  As such, all we need is the state of the ICVs.  */
+
+struct gomp_task
+{
+  struct gomp_task *prev;
+  struct gomp_task_icv icv;
+};
+
 /* This structure describes a "team" of threads.  These are the threads
    that are spawned by a PARALLEL constructs, as well as the work sharing
    constructs that the team encounters.  */
@@ -200,46 +234,17 @@ struct gomp_team
      parallels, as the master is a member of two teams.  */
   gomp_sem_t master_release;
 
-  /* This barrier is used for most synchronization of the team.  */
-  gomp_barrier_t barrier;
-
-  /* This array contains pointers to the release semaphore of the threads
-     in the team.  */
-  gomp_sem_t *ordered_release[];
-};
+  /* This points to an array with pointers to the release semaphore
+     of the threads in the team.  */
+  gomp_sem_t **ordered_release;
 
-/* These are the OpenMP 3.0 Internal Control Variables described in
-   section 2.3.1.  Those described as having one copy per task are
-   stored within the structure; those described as having one copy
-   for the whole program are (naturally) global variables.  */
-
-struct gomp_task_icv
-{
-  unsigned long nthreads_var;
-  enum gomp_schedule_type run_sched_var;
-  int run_sched_modifier;
-  bool dyn_var;
-  bool nest_var;
-};
+  struct gomp_work_share *init_work_shares[4];
 
-extern struct gomp_task_icv gomp_global_icv;
-extern unsigned long gomp_thread_limit_var;
-extern unsigned long gomp_remaining_threads_count;
-#ifndef HAVE_SYNC_BUILTINS
-extern gomp_mutex_t gomp_remaining_threads_lock;
-#endif
-extern unsigned long gomp_max_active_levels_var;
-extern bool gomp_active_wait_policy;
-extern unsigned long long gomp_spin_count_var;
-
-/* This structure describes a "task" to be run by a thread.  At present
-   we implement only synchronous tasks, i.e. no tasks are deferred or
-   untied.  As such, all we need is the state of the ICVs.  */
+  /* This barrier is used for most synchronization of the team.  */
+  gomp_barrier_t barrier;
 
-struct gomp_task
-{
-  struct gomp_task *prev;
-  struct gomp_task_icv icv;
+  /* This array contains structures for implicit tasks.  */
+  struct gomp_task implicit_task[];
 };
 
 /* This structure contains all data that is private to libgomp and is
@@ -352,8 +357,8 @@ extern unsigned gomp_dynamic_max_threads
 
 /* task.c */
 
-extern struct gomp_task *gomp_new_task (struct gomp_task *,
-					struct gomp_task_icv *);
+extern void gomp_init_task (struct gomp_task *, struct gomp_task *,
+			    struct gomp_task_icv *);
 extern void gomp_end_task (void);
 
 /* team.c */
--- libgomp/team.c	(revision 133291)
+++ libgomp/team.c	(working copy)
@@ -144,20 +144,22 @@ new_team (unsigned nthreads, struct gomp
   struct gomp_team *team;
   size_t size;
 
-  size = sizeof (*team) + nthreads * sizeof (team->ordered_release[0]);
+  size = sizeof (*team) + nthreads * (sizeof (team->ordered_release[0])
+				      + sizeof (team->implicit_task[0]));
   team = gomp_malloc (size);
   gomp_mutex_init (&team->work_share_lock);
 
-  team->work_shares = gomp_malloc (4 * sizeof (struct gomp_work_share *));
+  team->work_shares = team->init_work_shares;
   team->generation_mask = 3;
   team->oldest_live_gen = work_share == NULL;
   team->num_live_gen = work_share != NULL;
-  team->work_shares[0] = work_share;
+  team->init_work_shares[0] = work_share;
 
   team->nthreads = nthreads;
   gomp_barrier_init (&team->barrier, nthreads);
 
   gomp_sem_init (&team->master_release, 0);
+  team->ordered_release = (void *) &team->implicit_task[nthreads];
   team->ordered_release[0] = &team->master_release;
 
   return team;
@@ -169,7 +171,8 @@ new_team (unsigned nthreads, struct gomp
 static void
 free_team (struct gomp_team *team)
 {
-  free (team->work_shares);
+  if (__builtin_expect (team->work_shares != team->init_work_shares, 0))
+    free (team->work_shares);
   gomp_mutex_destroy (&team->work_share_lock);
   gomp_barrier_destroy (&team->barrier);
   gomp_sem_destroy (&team->master_release);
@@ -212,7 +215,8 @@ gomp_team_start (void (*fn) (void *), vo
     ++thr->ts.active_level;
   thr->ts.work_share_generation = 0;
   thr->ts.static_trip = 0;
-  thr->task = gomp_new_task (task, icv);
+  thr->task = &team->implicit_task[0];
+  gomp_init_task (thr->task, task, icv);
 
   if (nthreads == 1)
     return;
@@ -260,7 +264,8 @@ gomp_team_start (void (*fn) (void *), vo
 	  nthr->ts.active_level = thr->ts.active_level;
 	  nthr->ts.work_share_generation = 0;
 	  nthr->ts.static_trip = 0;
-	  nthr->task = gomp_new_task (task, icv);
+	  nthr->task = &team->implicit_task[i];
+	  gomp_init_task (nthr->task, task, icv);
 	  nthr->fn = fn;
 	  nthr->data = data;
 	  team->ordered_release[i] = &nthr->release;
@@ -311,7 +316,8 @@ gomp_team_start (void (*fn) (void *), vo
       start_data->ts.active_level = thr->ts.active_level;
       start_data->ts.work_share_generation = 0;
       start_data->ts.static_trip = 0;
-      start_data->task = gomp_new_task (task, icv);
+      start_data->task = &team->implicit_task[i];
+      gomp_init_task (start_data->task, task, icv);
       start_data->nested = nested;
 
       if (gomp_cpu_affinity != NULL)
--- libgomp/task.c	(revision 133291)
+++ libgomp/task.c	(working copy)
@@ -34,14 +34,12 @@
 
 /* Create a new task data structure.  */
 
-struct gomp_task *
-gomp_new_task (struct gomp_task *prev_task, struct gomp_task_icv *prev_icv)
+void
+gomp_init_task (struct gomp_task *task, struct gomp_task *prev_task,
+		struct gomp_task_icv *prev_icv)
 {
-  struct gomp_task *task = gomp_malloc (sizeof (struct gomp_task));
   task->prev = prev_task;
   task->icv = *prev_icv;
-
-  return task;
 }
 
 /* Clean up and free a task, after completing it.  */
@@ -53,7 +51,6 @@ gomp_end_task (void)
   struct gomp_task *task = thr->task;
 
   thr->task = task->prev;
-  free (task);
 }
 
 /* Called when encountering an explicit task directive.  If IF_CLAUSE is
@@ -66,7 +63,9 @@ GOMP_task (void (*fn) (void *), void *da
 	   unsigned flags __attribute__((unused)))
 {
   struct gomp_thread *thr = gomp_thread ();
-  thr->task = gomp_new_task (thr->task, gomp_icv ());
+  struct gomp_task task;
+  gomp_init_task (&task, thr->task, gomp_icv ());
+  thr->task = &task;
 
   /* We only implement synchronous tasks at the moment, which means that
      we cannot defer or untie the task.  Which means we execute it now.  */
--- libgomp/work.c	(revision 133291)
+++ libgomp/work.c	(working copy)
@@ -109,9 +109,17 @@ gomp_work_share_start (bool ordered)
   /* Resize the work shares queue if we've run out of space.  */
   if (team->num_live_gen++ == team->generation_mask)
     {
-      team->work_shares = gomp_realloc (team->work_shares,
-					2 * team->num_live_gen
-					* sizeof (*team->work_shares));
+      if (team->work_shares == team->init_work_shares)
+	{
+	  team->work_shares = gomp_malloc (2 * team->num_live_gen
+					   * sizeof (*team->work_shares));
+	  memcpy (team->work_shares, team->init_work_shares,
+		  sizeof (team->init_work_shares));
+	}
+      else
+	team->work_shares = gomp_realloc (team->work_shares,
+					  2 * team->num_live_gen
+					  * sizeof (*team->work_shares));
 
       /* Unless oldest_live_gen is zero, the sequence of live elements
 	 wraps around the end of the array.  If we do nothing, we break

	Jakub

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2008-03-17 16:21 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-03-17 18:14 [gomp3] #pragma omp parallel ; speedup Jakub Jelinek

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).