diff -ru trunk-orig/libgomp/libgomp.h trunk-work/libgomp/libgomp.h --- trunk-orig/libgomp/libgomp.h 2018-12-20 23:24:20.040375724 +0800 +++ trunk-work/libgomp/libgomp.h 2018-12-21 23:32:47.938628954 +0800 @@ -937,6 +937,13 @@ #include "splay-tree.h" +struct goacc_asyncqueue_desc +{ + int nasyncqueue; + struct goacc_asyncqueue **asyncqueue; + struct goacc_asyncqueue_list *active; +}; + typedef struct acc_dispatch_t { /* This is a linked list of data mapped using the @@ -955,10 +962,7 @@ *destroy_thread_data_func; struct { - gomp_mutex_t lock; - int nasyncqueue; - struct goacc_asyncqueue **asyncqueue; - struct goacc_asyncqueue_list *active; + struct goacc_asyncqueue_desc *aqdesc; __typeof (GOMP_OFFLOAD_openacc_async_construct) *construct_func; __typeof (GOMP_OFFLOAD_openacc_async_destruct) *destruct_func; diff -ru trunk-orig/libgomp/oacc-async.c trunk-work/libgomp/oacc-async.c --- trunk-orig/libgomp/oacc-async.c 2018-12-18 22:19:51.923102938 +0800 +++ trunk-work/libgomp/oacc-async.c 2018-12-21 23:40:36.088528890 +0800 @@ -70,45 +70,84 @@ struct gomp_device_descr *dev = thr->dev; - gomp_mutex_lock (&dev->openacc.async.lock); + struct goacc_asyncqueue_desc *aqdesc = dev->openacc.async.aqdesc; - if (!create - && (async >= dev->openacc.async.nasyncqueue - || !dev->openacc.async.asyncqueue[async])) - { - gomp_mutex_unlock (&dev->openacc.async.lock); - return NULL; - } + if (!create) + return async < aqdesc->nasyncqueue ? aqdesc->asyncqueue[async] : NULL; - if (async >= dev->openacc.async.nasyncqueue) - { - int diff = async + 1 - dev->openacc.async.nasyncqueue; - dev->openacc.async.asyncqueue - = gomp_realloc (dev->openacc.async.asyncqueue, - sizeof (goacc_aq) * (async + 1)); - memset (dev->openacc.async.asyncqueue + dev->openacc.async.nasyncqueue, - 0, sizeof (goacc_aq) * diff); - dev->openacc.async.nasyncqueue = async + 1; - } + if (async < aqdesc->nasyncqueue && aqdesc->asyncqueue[async]) + return aqdesc->asyncqueue[async]; - if (!dev->openacc.async.asyncqueue[async]) - { - dev->openacc.async.asyncqueue[async] = dev->openacc.async.construct_func (); + struct goacc_asyncqueue *new_aq = dev->openacc.async.construct_func (); + const int inc_size = 8; + int new_sz = async + 1 + inc_size; + new_sz = (new_sz + inc_size - 1) & ~(inc_size - 1); + + struct goacc_asyncqueue **new_aqvec = + gomp_malloc_cleared (sizeof (goacc_aq) * new_sz); + new_aqvec[async] = new_aq; + + struct goacc_asyncqueue_desc *ndesc = + gomp_malloc (sizeof (struct goacc_asyncqueue_desc)); + ndesc->asyncqueue = new_aqvec; + ndesc->nasyncqueue = new_sz; + + goacc_aq_list nl = gomp_malloc (sizeof (struct goacc_asyncqueue_list)); + nl->aq = new_aq; + ndesc->active = nl; + + /* Loop to atomically create and add a new async queue, without using a lock, + though may possibly requiring creating and replacing with a new descriptor. + This may likely not be as optimized as it can be, although simplifies + other handling of async queues. */ + do { + int curr_num_aqs = aqdesc->nasyncqueue; + + if (async < curr_num_aqs) + { + /* Someone else expanded the asyncqeue array to enough size, free the + allocated resources. */ + free (ndesc->asyncqueue); + free (ndesc); + + /* Try to set the new aq to the right place again. */ + struct goacc_asyncqueue_desc *nullq = NULL; + if (__atomic_compare_exchange (&aqdesc->asyncqueue[async], + &nullq, &new_aq, false, + __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE)) + { + /* Push new asyncqueue to front of active list. */ + nl->next = aqdesc->active; + while (!__atomic_compare_exchange (&aqdesc->active, &nl->next, &nl, + true, __ATOMIC_ACQ_REL, + __ATOMIC_ACQUIRE)) + ; + } + else + { + /* Okay, someone else already created 'async', free/destroy + everything allocated and return existing queue directly. */ + if (!dev->openacc.async.destruct_func (new_aq)) + gomp_fatal ("error in async queue creation"); + free (nl); + } + return aqdesc->asyncqueue[async]; + } - if (!dev->openacc.async.asyncqueue[async]) - { - gomp_mutex_unlock (&dev->openacc.async.lock); - gomp_fatal ("async %d creation failed", async); - } - - /* Link new async queue into active list. */ - goacc_aq_list n = gomp_malloc (sizeof (struct goacc_asyncqueue_list)); - n->aq = dev->openacc.async.asyncqueue[async]; - n->next = dev->openacc.async.active; - dev->openacc.async.active = n; - } - gomp_mutex_unlock (&dev->openacc.async.lock); - return dev->openacc.async.asyncqueue[async]; + /* Copy current asyncqueue contents*/ + memcpy (ndesc->asyncqueue, aqdesc->asyncqueue, + sizeof (struct goacc_asyncqueue *) * curr_num_aqs); + ndesc->active->next = aqdesc->active; + + } while (!__atomic_compare_exchange (&dev->openacc.async.aqdesc, + &aqdesc, &ndesc, false, + __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE)); + + /* Free the old stale descriptor and resources in aqdesc. */ + free (aqdesc->asyncqueue); + free (aqdesc); + + return dev->openacc.async.aqdesc->asyncqueue[async]; } attribute_hidden struct goacc_asyncqueue * @@ -139,14 +178,12 @@ struct goacc_thread *thr = get_goacc_thread (); int ret = 1; - gomp_mutex_lock (&thr->dev->openacc.async.lock); - for (goacc_aq_list l = thr->dev->openacc.async.active; l; l = l->next) + for (goacc_aq_list l = thr->dev->openacc.async.aqdesc->active; l; l = l->next) if (!thr->dev->openacc.async.test_func (l->aq)) { ret = 0; break; } - gomp_mutex_unlock (&thr->dev->openacc.async.lock); return ret; } @@ -194,10 +231,8 @@ { struct gomp_device_descr *dev = get_goacc_thread_device (); - gomp_mutex_lock (&dev->openacc.async.lock); - for (goacc_aq_list l = dev->openacc.async.active; l; l = l->next) + for (goacc_aq_list l = dev->openacc.async.aqdesc->active; l; l = l->next) dev->openacc.async.synchronize_func (l->aq); - gomp_mutex_unlock (&dev->openacc.async.lock); } /* acc_async_wait_all is an OpenACC 1.0 compatibility name for acc_wait_all. */ @@ -221,14 +256,12 @@ goacc_aq waiting_queue = lookup_goacc_asyncqueue (thr, true, async); - gomp_mutex_lock (&thr->dev->openacc.async.lock); - for (goacc_aq_list l = thr->dev->openacc.async.active; l; l = l->next) + for (goacc_aq_list l = thr->dev->openacc.async.aqdesc->active; l; l = l->next) { thr->dev->openacc.async.synchronize_func (l->aq); if (waiting_queue) thr->dev->openacc.async.serialize_func (l->aq, waiting_queue); } - gomp_mutex_unlock (&thr->dev->openacc.async.lock); } int @@ -261,30 +294,27 @@ attribute_hidden void goacc_init_asyncqueues (struct gomp_device_descr *devicep) { - gomp_mutex_init (&devicep->openacc.async.lock); - devicep->openacc.async.nasyncqueue = 0; - devicep->openacc.async.asyncqueue = NULL; - devicep->openacc.async.active = NULL; + devicep->openacc.async.aqdesc = + gomp_malloc_cleared (sizeof (struct goacc_asyncqueue_desc)); } attribute_hidden bool goacc_fini_asyncqueues (struct gomp_device_descr *devicep) { + struct goacc_asyncqueue_desc *aqdesc = devicep->openacc.async.aqdesc; bool ret = true; - if (devicep->openacc.async.nasyncqueue > 0) + + if (aqdesc->nasyncqueue > 0) { goacc_aq_list next; - for (goacc_aq_list l = devicep->openacc.async.active; l; l = next) + for (goacc_aq_list l = aqdesc->active; l; l = next) { ret &= devicep->openacc.async.destruct_func (l->aq); next = l->next; free (l); } - free (devicep->openacc.async.asyncqueue); - devicep->openacc.async.nasyncqueue = 0; - devicep->openacc.async.asyncqueue = NULL; - devicep->openacc.async.active = NULL; + free (aqdesc->asyncqueue); } - gomp_mutex_destroy (&devicep->openacc.async.lock); + free (aqdesc); return ret; } diff -ru trunk-orig/libgomp/oacc-cuda.c trunk-work/libgomp/oacc-cuda.c --- trunk-orig/libgomp/oacc-cuda.c 2018-12-18 18:09:08.601913507 +0800 +++ trunk-work/libgomp/oacc-cuda.c 2018-12-21 23:34:10.024738834 +0800 @@ -87,9 +87,7 @@ if (thr && thr->dev && thr->dev->openacc.cuda.set_stream_func) { goacc_aq aq = get_goacc_asyncqueue (async); - gomp_mutex_lock (&thr->dev->openacc.async.lock); ret = thr->dev->openacc.cuda.set_stream_func (aq, stream); - gomp_mutex_unlock (&thr->dev->openacc.async.lock); } return ret;