* New benchtest: pthread locks @ 2020-10-08 5:34 DJ Delorie 2020-10-10 3:55 ` [v2] " DJ Delorie 0 siblings, 1 reply; 8+ messages in thread From: DJ Delorie @ 2020-10-08 5:34 UTC (permalink / raw) To: libc-alpha Performance benchmarks for various posix locks: mutex, rwlock, spinlock, condvar, and semaphore. Each test is performed with an empty loop body or with a computationally "interesting" (i.e. difficult to optimize away, and used just to allow lock code to be "hidden" in the filler's CPU cycles). diff --git a/benchtests/Makefile b/benchtests/Makefile index 922e2a94b1..36ea224d9d 100644 --- a/benchtests/Makefile +++ b/benchtests/Makefile @@ -31,7 +31,7 @@ ifneq (,$(filter yes,$(float128-fcts) $(float128-alias-fcts))) bench-math += expf128 powf128 sinf128 endif -bench-pthread := pthread_once thread_create +bench-pthread := pthread_once thread_create pthread-locks bench-string := ffs ffsll diff --git a/benchtests/bench-pthread-locks.c b/benchtests/bench-pthread-locks.c new file mode 100644 index 0000000000..cc866dd5ad --- /dev/null +++ b/benchtests/bench-pthread-locks.c @@ -0,0 +1,502 @@ +/* Measure various lock acquisition times for empty critical sections. + Copyright (C) 2020 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#define TEST_MAIN +#define TEST_NAME "pthread-locks" + +#include <stdio.h> +#include <stdlib.h> +#include <pthread.h> +#include <semaphore.h> +#include <stdatomic.h> +#include <sys/time.h> +#include "bench-timing.h" +#include "json-lib.h" + +/* The point of this benchmark is to measure the overhead of an empty + critical section or a small critical section. This is never going + to be indicative of real application performance. Instead we are + trying to benchmark the effects of the compiler and the runtime + coupled with a particular set of hardware atomic operations. + The numbers from this benchmark should be taken with a massive gain + of salt and viewed through the eyes of expert reviewers. */ + +static pthread_mutex_t m; +static pthread_rwlock_t rw; +static pthread_cond_t cv; +static pthread_cond_t consumer_c, producer_c; +static int cv_done; +static pthread_spinlock_t sp; +static sem_t sem; + +typedef timing_t (*test_t)(long, int); + +#define START_ITERS 1000 + +#define FILLER_GOES_HERE \ + if (filler) \ + do_filler (); + +/* Everyone loves a good fibonacci series. This isn't quite one of + them because we need larger values in fewer steps, in a way that + won't be optimized away. We're looking to approximately double the + total time each test iteration takes, so as to not swamp the useful + timings. */ + +#pragma GCC push_options +#pragma GCC optimize(1) + +static int __attribute__((noinline)) +fibonacci (int i) +{ + asm(""); + if (i > 2) + return fibonacci (i-1) + fibonacci (i-2); + return 10+i; +} + +static void +do_filler (void) +{ + static char buf1[512], buf2[512]; + int f = fibonacci (5); + memcpy (buf1, buf2, f); +} + +#pragma GCC pop_options + +static timing_t +test_mutex (long iters, int filler) +{ + timing_t start, stop, cur; + + pthread_mutex_init (&m, NULL); + + TIMING_NOW (start); + for (long j = iters; j >= 0; --j) + { + pthread_mutex_lock (&m); + FILLER_GOES_HERE; + pthread_mutex_unlock (&m); + } + TIMING_NOW (stop); + TIMING_DIFF (cur, start, stop); + + return cur; +} + +static timing_t +test_mutex_trylock (long iters, int filler) +{ + timing_t start, stop, cur; + + pthread_mutex_init (&m, NULL); + pthread_mutex_lock (&m); + + TIMING_NOW (start); + for (long j = iters; j >= 0; --j) + { + pthread_mutex_trylock (&m); + FILLER_GOES_HERE; + } + TIMING_NOW (stop); + TIMING_DIFF (cur, start, stop); + + pthread_mutex_unlock (&m); + return cur; +} + +static timing_t +test_rwlock_read (long iters, int filler) +{ + timing_t start, stop, cur; + + pthread_rwlock_init (&rw, NULL); + + TIMING_NOW (start); + for (long j = iters; j >= 0; --j) + { + pthread_rwlock_rdlock (&rw); + FILLER_GOES_HERE; + pthread_rwlock_unlock (&rw); + } + TIMING_NOW (stop); + TIMING_DIFF (cur, start, stop); + + return cur; +} + +static timing_t +test_rwlock_tryread (long iters, int filler) +{ + timing_t start, stop, cur; + + pthread_rwlock_init (&rw, NULL); + pthread_rwlock_wrlock (&rw); + + TIMING_NOW (start); + for (long j = iters; j >= 0; --j) + { + pthread_rwlock_tryrdlock (&rw); + FILLER_GOES_HERE; + } + TIMING_NOW (stop); + TIMING_DIFF (cur, start, stop); + + pthread_rwlock_unlock (&rw); + return cur; +} + +static timing_t +test_rwlock_write (long iters, int filler) +{ + timing_t start, stop, cur; + + pthread_rwlock_init (&rw, NULL); + + TIMING_NOW (start); + for (long j = iters; j >= 0; --j) + { + pthread_rwlock_wrlock (&rw); + FILLER_GOES_HERE; + pthread_rwlock_unlock (&rw); + } + TIMING_NOW (stop); + TIMING_DIFF (cur, start, stop); + + return cur; +} + +static timing_t +test_rwlock_trywrite (long iters, int filler) +{ + timing_t start, stop, cur; + + pthread_rwlock_init (&rw, NULL); + pthread_rwlock_rdlock (&rw); + + TIMING_NOW (start); + for (long j = iters; j >= 0; --j) + { + pthread_rwlock_trywrlock (&rw); + FILLER_GOES_HERE; + } + TIMING_NOW (stop); + TIMING_DIFF (cur, start, stop); + + pthread_rwlock_unlock (&rw); + return cur; +} + +static timing_t +test_spin_lock (long iters, int filler) +{ + timing_t start, stop, cur; + + pthread_spin_init (&sp, PTHREAD_PROCESS_PRIVATE); + + TIMING_NOW (start); + for (long j = iters; j >= 0; --j) + { + pthread_spin_lock (&sp); + FILLER_GOES_HERE; + pthread_spin_unlock (&sp); + } + TIMING_NOW (stop); + TIMING_DIFF (cur, start, stop); + + return cur; +} + +static timing_t +test_spin_trylock (long iters, int filler) +{ + timing_t start, stop, cur; + + pthread_spin_init (&sp, PTHREAD_PROCESS_PRIVATE); + pthread_spin_lock (&sp); + + TIMING_NOW (start); + for (long j = iters; j >= 0; --j) + { + pthread_spin_trylock (&sp); + FILLER_GOES_HERE; + } + TIMING_NOW (stop); + TIMING_DIFF (cur, start, stop); + + pthread_spin_unlock (&sp); + return cur; +} + +static timing_t +test_sem_wait (long iters, int filler) +{ + timing_t start, stop, cur; + + sem_init (&sem, 0, 1); + + TIMING_NOW (start); + for (long j = iters; j >= 0; --j) + { + sem_post (&sem); + FILLER_GOES_HERE; + sem_wait (&sem); + } + TIMING_NOW (stop); + TIMING_DIFF (cur, start, stop); + + return cur; +} + +static timing_t +test_sem_trywait (long iters, int filler) +{ + timing_t start, stop, cur; + + sem_init (&sem, 0, 0); + + TIMING_NOW (start); + for (long j = iters; j >= 0; --j) + { + sem_trywait (&sem); + FILLER_GOES_HERE; + } + TIMING_NOW (stop); + TIMING_DIFF (cur, start, stop); + + return cur; +} + +static void * +test_condvar_helper (void *v) +{ + /* This is wasteful, but the alternative is to add the overhead of a + mutex lock/unlock to the overall iteration (both threads) and we + don't want that. Ideally, this thread would run on an + independent processing core anyway. The ONLY goal here is to + minimize the time the other thread spends waiting for us. */ + while (__atomic_load_n (&cv_done, __ATOMIC_RELAXED) == 0) + pthread_cond_signal (&cv); + + return NULL; +} + +static timing_t +test_condvar (long iters, int filler) +{ + timing_t start, stop, cur; + pthread_t helper_id; + + pthread_mutex_init (&m, NULL); + pthread_cond_init (&cv, NULL); + pthread_mutex_lock (&m); + + __atomic_store_n (&cv_done, 0, __ATOMIC_RELAXED); + pthread_create (&helper_id, NULL, test_condvar_helper, &iters); + + TIMING_NOW (start); + for (long j = iters; j >= 0; --j) + { + pthread_cond_wait (&cv, &m); + FILLER_GOES_HERE; + } + TIMING_NOW (stop); + TIMING_DIFF (cur, start, stop); + + pthread_mutex_unlock (&m); + __atomic_store_n (&cv_done, 1, __ATOMIC_RELAXED); + + pthread_join (helper_id, NULL); + return cur; +} + +/* How many items are "queued" in our pretend queue. */ +static int queued = 0; + +typedef struct Producer_Params { + long iters; + int filler; +} Producer_Params; + +/* We only benchmark the consumer thread, but both threads are doing + essentially the same thing, and never run in parallel due to the + locks. Thus, even if they run on separate processing cores, we + count the time for both threads. */ +static void * +test_producer_thread (void *v) +{ + Producer_Params *p = (Producer_Params *) v; + long iters = p->iters; + int filler = p->filler; + long j; + + for (j = iters; j >= 0; --j) + { + /* Aquire lock on the queue. */ + pthread_mutex_lock (&m); + /* if something's already there, wait. */ + while (queued > 0) + pthread_cond_wait (&consumer_c, &m); + + /* Put something on the queue */ + FILLER_GOES_HERE; + ++ queued; + pthread_cond_signal (&producer_c); + + /* Give the other thread a chance to run. */ + pthread_mutex_unlock (&m); + } + + return NULL; +} + +static timing_t +test_consumer_producer (long iters, int filler) +{ + timing_t start, stop, cur; + pthread_t helper_id; + Producer_Params p; + + p.iters = iters; + p.filler = filler; + + pthread_mutex_init (&m, NULL); + pthread_cond_init (&cv, NULL); + + pthread_create (&helper_id, NULL, test_producer_thread, &p); + + TIMING_NOW (start); + + for (long j = iters; j >= 0; --j) + { + /* Aquire lock on the queue. */ + pthread_mutex_lock (&m); + /* Wait for something to be on the queue. */ + while (queued == 0) + pthread_cond_wait (&producer_c, &m); + + /* Take if off. */ + FILLER_GOES_HERE; + -- queued; + pthread_cond_signal (&consumer_c); + + /* Give the other thread a chance to run. */ + pthread_mutex_unlock (&m); + } + + TIMING_NOW (stop); + TIMING_DIFF (cur, start, stop); + + + pthread_join (helper_id, NULL); + return cur; +} + +static int +do_bench_2 (const char *name, test_t func, int filler, json_ctx_t *j) +{ + timing_t cur; + struct timeval ts, te; + double tsd, ted, td; + long iters, iters_limit; + + iters = START_ITERS; + iters_limit = LONG_MAX / 10; + + while (1) { + gettimeofday (&ts, NULL); + cur = func(iters, filler); + gettimeofday (&te, NULL); + + /* We want a test to take at least 0.1 seconds, and try + increasingly larger iteration counts until it does. This + allows for approximately constant-time tests regardless of + hardware speed, without the overhead of checking the time + inside the test loop itself. */ + + /* Note that this also primes the CPU cache and triggers faster + MHz, we hope. */ + tsd = ts.tv_sec + ts.tv_usec / 1000000.0; + ted = te.tv_sec + te.tv_usec / 1000000.0; + td = ted - tsd; + if (td >= 0.1 || iters >= iters_limit) + break; + + iters *= 10; + } + + json_attr_object_begin (j, filler ? "filler" : "empty"); + + json_attr_double (j, "duration", (double) iters); + json_attr_double (j, "iterations", (double) iters); + json_attr_double (j, "mean", (double) cur / (double) iters); + + json_attr_object_end (j); + + return 0; +} + +static int +do_bench_1 (const char *name, test_t func, json_ctx_t *j) +{ + int rv = 0; + + json_attr_object_begin (j, name); + + rv += do_bench_2 (name, func, 0, j); + rv += do_bench_2 (name, func, 1, j); + + json_attr_object_end (j); + + return rv; +} + +int +do_bench (void) +{ + int rv = 0; + json_ctx_t json_ctx; + + json_init (&json_ctx, 2, stdout); + json_attr_object_begin (&json_ctx, "pthread_locks"); + +#define BENCH(n) rv += do_bench_1 (#n, test_##n, &json_ctx) + + BENCH (mutex); + BENCH (mutex_trylock); + BENCH (rwlock_read); + BENCH (rwlock_tryread); + BENCH (rwlock_write); + BENCH (rwlock_trywrite); + BENCH (spin_lock); + BENCH (spin_trylock); + BENCH (sem_wait); + BENCH (sem_trywait); + BENCH (condvar); + BENCH (consumer_producer); + + json_attr_object_end (&json_ctx); + + return rv; +} + + +#define TEST_FUNCTION do_bench () + +#include "../test-skeleton.c" ^ permalink raw reply [flat|nested] 8+ messages in thread
* [v2] New benchtest: pthread locks 2020-10-08 5:34 New benchtest: pthread locks DJ Delorie @ 2020-10-10 3:55 ` DJ Delorie 2020-10-12 7:05 ` Paul Zimmermann 2020-10-14 1:27 ` Siddhesh Poyarekar 0 siblings, 2 replies; 8+ messages in thread From: DJ Delorie @ 2020-10-10 3:55 UTC (permalink / raw) To: libc-alpha v2 changes do_bench_2 to run multiple passes and provide a mean and standard deviation as well as some other statistics, since I got tired of running the whole benchmark multiple times and computing them manually ;-) From 94653e93eec9403ebf61b00179799751ce12088d Mon Sep 17 00:00:00 2001 From: DJ Delorie <dj@redhat.com> Date: Wed, 7 Oct 2020 17:04:12 -0400 Subject: New benchtest: pthread locks Performance benchmarks for various posix locks: mutex, rwlock, spinlock, condvar, and semaphore. Each test is performed with an empty loop body or with a computationally "interesting" (i.e. difficult to optimize away, and used just to allow lock code to be "hidden" in the filler's CPU cycles). diff --git a/benchtests/Makefile b/benchtests/Makefile index 922e2a94b1..5cd211ee9a 100644 --- a/benchtests/Makefile +++ b/benchtests/Makefile @@ -31,7 +31,7 @@ ifneq (,$(filter yes,$(float128-fcts) $(float128-alias-fcts))) bench-math += expf128 powf128 sinf128 endif -bench-pthread := pthread_once thread_create +bench-pthread := pthread_once thread_create pthread-locks bench-string := ffs ffsll @@ -109,6 +109,7 @@ $(addprefix $(objpfx)bench-,$(bench-math)): $(libm) $(addprefix $(objpfx)bench-,$(math-benchset)): $(libm) $(addprefix $(objpfx)bench-,$(bench-pthread)): $(shared-thread-library) $(addprefix $(objpfx)bench-,$(bench-malloc)): $(shared-thread-library) +$(addprefix $(objpfx)bench-,pthread-locks): $(libm) \f diff --git a/benchtests/bench-pthread-locks.c b/benchtests/bench-pthread-locks.c new file mode 100644 index 0000000000..2bd49d8762 --- /dev/null +++ b/benchtests/bench-pthread-locks.c @@ -0,0 +1,554 @@ +/* Measure various lock acquisition times for empty critical sections. + Copyright (C) 2020 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#define TEST_MAIN +#define TEST_NAME "pthread-locks" + +#include <stdio.h> +#include <string.h> +#include <limits.h> +#include <stdlib.h> +#include <pthread.h> +#include <semaphore.h> +#include <stdatomic.h> +#include <sys/time.h> +#include <math.h> +#include "bench-timing.h" +#include "json-lib.h" + +/* The point of this benchmark is to measure the overhead of an empty + critical section or a small critical section. This is never going + to be indicative of real application performance. Instead we are + trying to benchmark the effects of the compiler and the runtime + coupled with a particular set of hardware atomic operations. + The numbers from this benchmark should be taken with a massive gain + of salt and viewed through the eyes of expert reviewers. */ + +static pthread_mutex_t m; +static pthread_rwlock_t rw; +static pthread_cond_t cv; +static pthread_cond_t consumer_c, producer_c; +static int cv_done; +static pthread_spinlock_t sp; +static sem_t sem; + +typedef timing_t (*test_t)(long, int); + +#define START_ITERS 1000 + +#define FILLER_GOES_HERE \ + if (filler) \ + do_filler (); + +/* Everyone loves a good fibonacci series. This isn't quite one of + them because we need larger values in fewer steps, in a way that + won't be optimized away. We're looking to approximately double the + total time each test iteration takes, so as to not swamp the useful + timings. */ + +#pragma GCC push_options +#pragma GCC optimize(1) + +static int __attribute__((noinline)) +fibonacci (int i) +{ + asm(""); + if (i > 2) + return fibonacci (i-1) + fibonacci (i-2); + return 10+i; +} + +static void +do_filler (void) +{ + static char buf1[512], buf2[512]; + int f = fibonacci (5); + memcpy (buf1, buf2, f); +} + +#pragma GCC pop_options + +static timing_t +test_mutex (long iters, int filler) +{ + timing_t start, stop, cur; + + pthread_mutex_init (&m, NULL); + + TIMING_NOW (start); + for (long j = iters; j >= 0; --j) + { + pthread_mutex_lock (&m); + FILLER_GOES_HERE; + pthread_mutex_unlock (&m); + } + TIMING_NOW (stop); + TIMING_DIFF (cur, start, stop); + + return cur; +} + +static timing_t +test_mutex_trylock (long iters, int filler) +{ + timing_t start, stop, cur; + + pthread_mutex_init (&m, NULL); + pthread_mutex_lock (&m); + + TIMING_NOW (start); + for (long j = iters; j >= 0; --j) + { + pthread_mutex_trylock (&m); + FILLER_GOES_HERE; + } + TIMING_NOW (stop); + TIMING_DIFF (cur, start, stop); + + pthread_mutex_unlock (&m); + return cur; +} + +static timing_t +test_rwlock_read (long iters, int filler) +{ + timing_t start, stop, cur; + + pthread_rwlock_init (&rw, NULL); + + TIMING_NOW (start); + for (long j = iters; j >= 0; --j) + { + pthread_rwlock_rdlock (&rw); + FILLER_GOES_HERE; + pthread_rwlock_unlock (&rw); + } + TIMING_NOW (stop); + TIMING_DIFF (cur, start, stop); + + return cur; +} + +static timing_t +test_rwlock_tryread (long iters, int filler) +{ + timing_t start, stop, cur; + + pthread_rwlock_init (&rw, NULL); + pthread_rwlock_wrlock (&rw); + + TIMING_NOW (start); + for (long j = iters; j >= 0; --j) + { + pthread_rwlock_tryrdlock (&rw); + FILLER_GOES_HERE; + } + TIMING_NOW (stop); + TIMING_DIFF (cur, start, stop); + + pthread_rwlock_unlock (&rw); + return cur; +} + +static timing_t +test_rwlock_write (long iters, int filler) +{ + timing_t start, stop, cur; + + pthread_rwlock_init (&rw, NULL); + + TIMING_NOW (start); + for (long j = iters; j >= 0; --j) + { + pthread_rwlock_wrlock (&rw); + FILLER_GOES_HERE; + pthread_rwlock_unlock (&rw); + } + TIMING_NOW (stop); + TIMING_DIFF (cur, start, stop); + + return cur; +} + +static timing_t +test_rwlock_trywrite (long iters, int filler) +{ + timing_t start, stop, cur; + + pthread_rwlock_init (&rw, NULL); + pthread_rwlock_rdlock (&rw); + + TIMING_NOW (start); + for (long j = iters; j >= 0; --j) + { + pthread_rwlock_trywrlock (&rw); + FILLER_GOES_HERE; + } + TIMING_NOW (stop); + TIMING_DIFF (cur, start, stop); + + pthread_rwlock_unlock (&rw); + return cur; +} + +static timing_t +test_spin_lock (long iters, int filler) +{ + timing_t start, stop, cur; + + pthread_spin_init (&sp, PTHREAD_PROCESS_PRIVATE); + + TIMING_NOW (start); + for (long j = iters; j >= 0; --j) + { + pthread_spin_lock (&sp); + FILLER_GOES_HERE; + pthread_spin_unlock (&sp); + } + TIMING_NOW (stop); + TIMING_DIFF (cur, start, stop); + + return cur; +} + +static timing_t +test_spin_trylock (long iters, int filler) +{ + timing_t start, stop, cur; + + pthread_spin_init (&sp, PTHREAD_PROCESS_PRIVATE); + pthread_spin_lock (&sp); + + TIMING_NOW (start); + for (long j = iters; j >= 0; --j) + { + pthread_spin_trylock (&sp); + FILLER_GOES_HERE; + } + TIMING_NOW (stop); + TIMING_DIFF (cur, start, stop); + + pthread_spin_unlock (&sp); + return cur; +} + +static timing_t +test_sem_wait (long iters, int filler) +{ + timing_t start, stop, cur; + + sem_init (&sem, 0, 1); + + TIMING_NOW (start); + for (long j = iters; j >= 0; --j) + { + sem_post (&sem); + FILLER_GOES_HERE; + sem_wait (&sem); + } + TIMING_NOW (stop); + TIMING_DIFF (cur, start, stop); + + return cur; +} + +static timing_t +test_sem_trywait (long iters, int filler) +{ + timing_t start, stop, cur; + + sem_init (&sem, 0, 0); + + TIMING_NOW (start); + for (long j = iters; j >= 0; --j) + { + sem_trywait (&sem); + FILLER_GOES_HERE; + } + TIMING_NOW (stop); + TIMING_DIFF (cur, start, stop); + + return cur; +} + +static void * +test_condvar_helper (void *v) +{ + /* This is wasteful, but the alternative is to add the overhead of a + mutex lock/unlock to the overall iteration (both threads) and we + don't want that. Ideally, this thread would run on an + independent processing core anyway. The ONLY goal here is to + minimize the time the other thread spends waiting for us. */ + while (__atomic_load_n (&cv_done, __ATOMIC_RELAXED) == 0) + pthread_cond_signal (&cv); + + return NULL; +} + +static timing_t +test_condvar (long iters, int filler) +{ + timing_t start, stop, cur; + pthread_t helper_id; + + pthread_mutex_init (&m, NULL); + pthread_cond_init (&cv, NULL); + pthread_mutex_lock (&m); + + __atomic_store_n (&cv_done, 0, __ATOMIC_RELAXED); + pthread_create (&helper_id, NULL, test_condvar_helper, &iters); + + TIMING_NOW (start); + for (long j = iters; j >= 0; --j) + { + pthread_cond_wait (&cv, &m); + FILLER_GOES_HERE; + } + TIMING_NOW (stop); + TIMING_DIFF (cur, start, stop); + + pthread_mutex_unlock (&m); + __atomic_store_n (&cv_done, 1, __ATOMIC_RELAXED); + + pthread_join (helper_id, NULL); + return cur; +} + +/* How many items are "queued" in our pretend queue. */ +static int queued = 0; + +typedef struct Producer_Params { + long iters; + int filler; +} Producer_Params; + +/* We only benchmark the consumer thread, but both threads are doing + essentially the same thing, and never run in parallel due to the + locks. Thus, even if they run on separate processing cores, we + count the time for both threads. */ +static void * +test_producer_thread (void *v) +{ + Producer_Params *p = (Producer_Params *) v; + long iters = p->iters; + int filler = p->filler; + long j; + + for (j = iters; j >= 0; --j) + { + /* Aquire lock on the queue. */ + pthread_mutex_lock (&m); + /* if something's already there, wait. */ + while (queued > 0) + pthread_cond_wait (&consumer_c, &m); + + /* Put something on the queue */ + FILLER_GOES_HERE; + ++ queued; + pthread_cond_signal (&producer_c); + + /* Give the other thread a chance to run. */ + pthread_mutex_unlock (&m); + } + + return NULL; +} + +static timing_t +test_consumer_producer (long iters, int filler) +{ + timing_t start, stop, cur; + pthread_t helper_id; + Producer_Params p; + + p.iters = iters; + p.filler = filler; + + pthread_mutex_init (&m, NULL); + pthread_cond_init (&cv, NULL); + + pthread_create (&helper_id, NULL, test_producer_thread, &p); + + TIMING_NOW (start); + + for (long j = iters; j >= 0; --j) + { + /* Aquire lock on the queue. */ + pthread_mutex_lock (&m); + /* Wait for something to be on the queue. */ + while (queued == 0) + pthread_cond_wait (&producer_c, &m); + + /* Take if off. */ + FILLER_GOES_HERE; + -- queued; + pthread_cond_signal (&consumer_c); + + /* Give the other thread a chance to run. */ + pthread_mutex_unlock (&m); + } + + TIMING_NOW (stop); + TIMING_DIFF (cur, start, stop); + + + pthread_join (helper_id, NULL); + return cur; +} + +/* Number of runs we use for computing mean and standard deviation. + We actually do two additional runs and discard the outliers. */ +#define RUN_COUNT 10 + +static int +do_bench_2 (const char *name, test_t func, int filler, json_ctx_t *js) +{ + timing_t cur; + struct timeval ts, te; + double tsd, ted, td; + long iters, iters_limit; + timing_t curs[RUN_COUNT + 2]; + int i, j; + double mean, stdev; + + iters = START_ITERS; + iters_limit = LONG_MAX / 100; + + while (1) { + gettimeofday (&ts, NULL); + cur = func(iters, filler); + gettimeofday (&te, NULL); + + /* We want a test to take at least 0.01 seconds, and try + increasingly larger iteration counts until it does. This + allows for approximately constant-time tests regardless of + hardware speed, without the overhead of checking the time + inside the test loop itself. We stop at a million iterations + as that should be precise enough. Once we determine a suitable + iteration count, we run the test multiple times to calculate + mean and standard deviation. */ + + /* Note that this also primes the CPU cache and triggers faster + MHz, we hope. */ + tsd = ts.tv_sec + ts.tv_usec / 1000000.0; + ted = te.tv_sec + te.tv_usec / 1000000.0; + td = ted - tsd; + if (td >= 0.01 + || iters >= iters_limit + || iters >= 1000000) + break; + + iters *= 10; + } + + curs[0] = cur; + for (i = 1; i < RUN_COUNT + 2; i ++) + curs[i] = func(iters, filler); + + /* We sort the results so we can discard the fastest and slowest + times as outliers. In theory we should keep the fastest time, + but IMHO this is more fair. A simple bubble sort suffices. */ + + for (i = 0; i < RUN_COUNT + 1; i ++) + for (j = i + 1; j < RUN_COUNT + 2; j ++) + if (curs[i] > curs[j]) + { + timing_t temp = curs[i]; + curs[i] = curs[j]; + curs[j] = temp; + } + + /* Now calculate mean and standard deviation, skipping the outliers. */ + mean = 0.0; + for (i = 1; i<RUN_COUNT + 1; i ++) + mean += (double) curs[i] / (double) iters; + mean /= RUN_COUNT; + + stdev = 0.0; + for (i = 1; i < RUN_COUNT + 1; i ++) + { + double s = (double) curs[i] / (double) iters - mean; + stdev += s * s; + } + stdev = sqrt (stdev / (RUN_COUNT - 1)); + + json_attr_object_begin (js, filler ? "filler" : "empty"); + + json_attr_double (js, "duration", (double) cur); + json_attr_double (js, "iterations", (double) iters); + json_attr_double (js, "wall_sec", (double) td); + json_attr_double (js, "mean", mean); + json_attr_double (js, "stdev", stdev); + json_attr_double (js, "min_outlier", (double) curs[0] / (double) iters); + json_attr_double (js, "min", (double) curs[1] / (double) iters); + json_attr_double (js, "max", (double) curs[RUN_COUNT] / (double) iters); + json_attr_double (js, "max_outlier", (double) curs[RUN_COUNT + 1] / (double) iters); + + json_attr_object_end (js); + + return 0; +} + +static int +do_bench_1 (const char *name, test_t func, json_ctx_t *js) +{ + int rv = 0; + + json_attr_object_begin (js, name); + + rv += do_bench_2 (name, func, 0, js); + rv += do_bench_2 (name, func, 1, js); + + json_attr_object_end (js); + + return rv; +} + +int +do_bench (void) +{ + int rv = 0; + json_ctx_t json_ctx; + + json_init (&json_ctx, 2, stdout); + json_attr_object_begin (&json_ctx, "pthread_locks"); + +#define BENCH(n) rv += do_bench_1 (#n, test_##n, &json_ctx) + + BENCH (mutex); + BENCH (mutex_trylock); + BENCH (rwlock_read); + BENCH (rwlock_tryread); + BENCH (rwlock_write); + BENCH (rwlock_trywrite); + BENCH (spin_lock); + BENCH (spin_trylock); + BENCH (sem_wait); + BENCH (sem_trywait); + BENCH (condvar); + BENCH (consumer_producer); + + json_attr_object_end (&json_ctx); + + return rv; +} + + +#define TEST_FUNCTION do_bench () + +#include "../test-skeleton.c" ^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [v2] New benchtest: pthread locks 2020-10-10 3:55 ` [v2] " DJ Delorie @ 2020-10-12 7:05 ` Paul Zimmermann 2020-10-12 17:33 ` DJ Delorie 2020-10-14 1:27 ` Siddhesh Poyarekar 1 sibling, 1 reply; 8+ messages in thread From: Paul Zimmermann @ 2020-10-12 7:05 UTC (permalink / raw) To: DJ Delorie; +Cc: libc-alpha Hi DJ, here is what I find on my processor (Intel(R) Core(TM) i5-4590 CPU @ 3.30GHz) with your patch (note that it was under heavy load). Does it look ok? Best regards, Paul "pthread_locks": { "mutex": { "empty": { "duration": 1.29659e+08, "iterations": 1e+06, "wall_sec": 0.0393829, "mean": 103.086, "stdev": 18.8037, "min_outlier": 91.389, "min": 91.396, "max": 130.909, "max_outlier": 130.91 }, "filler": { "duration": 1.81231e+08, "iterations": 1e+06, "wall_sec": 0.055047, "mean": 173.598, "stdev": 12.8244, "min_outlier": 166.75, "min": 166.751, "max": 206.287, "max_outlier": 208.943 } }, "mutex_trylock": { "empty": { "duration": 8.00459e+07, "iterations": 1e+06, "wall_sec": 0.0243139, "mean": 54.1088, "stdev": 19.6554, "min_outlier": 27.3545, "min": 27.3548, "max": 80.0422, "max_outlier": 80.0459 }, "filler": { "duration": 1.43876e+08, "iterations": 1e+06, "wall_sec": 0.0437009, "mean": 134.758, "stdev": 16.2904, "min_outlier": 104.209, "min": 104.217, "max": 143.958, "max_outlier": 144.563 } }, "rwlock_read": { "empty": { "duration": 1.38636e+08, "iterations": 1e+06, "wall_sec": 0.0421081, "mean": 121.985, "stdev": 15.337, "min_outlier": 93.4482, "min": 99.1126, "max": 138.636, "max_outlier": 151.863 }, "filler": { "duration": 2.11036e+08, "iterations": 1e+06, "wall_sec": 0.0641, "mean": 186.195, "stdev": 25.7311, "min_outlier": 156.256, "min": 157.995, "max": 223.986, "max_outlier": 227.361 } }, "rwlock_tryread": { "empty": { "duration": 8.48573e+06, "iterations": 1e+06, "wall_sec": 0.00257683, "mean": 12.4413, "stdev": 12.496, "min_outlier": 8.48572, "min": 8.48573, "max": 48.0057, "max_outlier": 48.0071 }, "filler": { "duration": 6.14027e+07, "iterations": 100000, "wall_sec": 0.0186498, "mean": 99.9205, "stdev": 124.966, "min_outlier": 60.3437, "min": 60.3437, "max": 455.579, "max_outlier": 614.027 } }, "rwlock_write": { "empty": { "duration": 4.82128e+07, "iterations": 100000, "wall_sec": 0.0146441, "mean": 179.078, "stdev": 164.916, "min_outlier": 86.77, "min": 86.7724, "max": 482.109, "max_outlier": 482.128 }, "filler": { "duration": 2.29868e+08, "iterations": 1e+06, "wall_sec": 0.0698202, "mean": 222.557, "stdev": 12.5448, "min_outlier": 190.339, "min": 190.977, "max": 229.963, "max_outlier": 243.063 } }, "rwlock_trywrite": { "empty": { "duration": 4.70629e+07, "iterations": 1e+06, "wall_sec": 0.0142951, "mean": 15.4493, "stdev": 16.6618, "min_outlier": 7.54287, "min": 7.54287, "max": 47.0629, "max_outlier": 47.063 }, "filler": { "duration": 1.0035e+08, "iterations": 1e+06, "wall_sec": 0.0304809, "mean": 116.491, "stdev": 23.9815, "min_outlier": 97.7879, "min": 97.9946, "max": 158.148, "max_outlier": 169.458 } }, "spin_lock": { "empty": { "duration": 2.26345e+07, "iterations": 1e+06, "wall_sec": 0.00687504, "mean": 42.3982, "stdev": 20.8246, "min_outlier": 22.6345, "min": 22.6348, "max": 62.1574, "max_outlier": 75.329 }, "filler": { "duration": 1.43121e+08, "iterations": 1e+06, "wall_sec": 0.0434711, "mean": 130.691, "stdev": 20.6261, "min_outlier": 103.234, "min": 103.728, "max": 158.001, "max_outlier": 160.629 } }, "spin_trylock": { "empty": { "duration": 6.40642e+07, "iterations": 1e+06, "wall_sec": 0.019459, "mean": 49.7264, "stdev": 18.2296, "min_outlier": 24.52, "min": 24.5262, "max": 64.0642, "max_outlier": 77.2103 }, "filler": { "duration": 9.69735e+07, "iterations": 1e+06, "wall_sec": 0.029454, "mean": 110.624, "stdev": 18.102, "min_outlier": 92.2759, "min": 96.6148, "max": 136.139, "max_outlier": 136.344 } }, "sem_wait": { "empty": { "duration": 1.40348e+08, "iterations": 1e+06, "wall_sec": 0.042629, "mean": 124.395, "stdev": 16.9173, "min_outlier": 100.823, "min": 100.825, "max": 140.354, "max_outlier": 172.701 }, "filler": { "duration": 2.09171e+08, "iterations": 1e+06, "wall_sec": 0.0635331, "mean": 177.185, "stdev": 19.1812, "min_outlier": 145.352, "min": 155.315, "max": 209.171, "max_outlier": 220.138 } }, "sem_trywait": { "empty": { "duration": 4.02749e+07, "iterations": 100000, "wall_sec": 0.012233, "mean": 7.54332, "stdev": 0.000211187, "min_outlier": 7.54305, "min": 7.54305, "max": 7.54359, "max_outlier": 402.749 }, "filler": { "duration": 1.00351e+08, "iterations": 1e+06, "wall_sec": 0.0304799, "mean": 123.021, "stdev": 19.3645, "min_outlier": 99.7954, "min": 100.351, "max": 139.863, "max_outlier": 139.869 } }, "condvar": { "empty": { "duration": 5.1244e+07, "iterations": 1000, "wall_sec": 0.015655, "mean": 45379.1, "stdev": 5285.79, "min_outlier": 18260.9, "min": 31450.7, "max": 51244, "max_outlier": 57756.6 }, "filler": { "duration": 4.73658e+07, "iterations": 1000, "wall_sec": 0.0144019, "mean": 40550.8, "stdev": 10623.3, "min_outlier": 18126, "min": 18233.3, "max": 52647.8, "max_outlier": 57678.7 } }, "consumer_producer": { "empty": { "duration": 8.02147e+07, "iterations": 1000, "wall_sec": 0.0243819, "mean": 77217.4, "stdev": 25019.9, "min_outlier": 51949, "min": 52390.9, "max": 115883, "max_outlier": 118116 }, "filler": { "duration": 9.22739e+07, "iterations": 1000, "wall_sec": 0.028044, "mean": 82932, "stdev": 23952.2, "min_outlier": 52421.7, "min": 54514.3, "max": 112212, "max_outlier": 115371 } } }, ^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [v2] New benchtest: pthread locks 2020-10-12 7:05 ` Paul Zimmermann @ 2020-10-12 17:33 ` DJ Delorie 2020-10-13 8:25 ` Paul Zimmermann 0 siblings, 1 reply; 8+ messages in thread From: DJ Delorie @ 2020-10-12 17:33 UTC (permalink / raw) To: Paul Zimmermann; +Cc: libc-alpha Those look like the type of results I expect, yes. Thanks! ^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [v2] New benchtest: pthread locks 2020-10-12 17:33 ` DJ Delorie @ 2020-10-13 8:25 ` Paul Zimmermann 0 siblings, 0 replies; 8+ messages in thread From: Paul Zimmermann @ 2020-10-13 8:25 UTC (permalink / raw) To: DJ Delorie; +Cc: libc-alpha > From: DJ Delorie <dj@redhat.com> > Date: Mon, 12 Oct 2020 13:33:56 -0400 > > Those look like the type of results I expect, yes. > Thanks! as a user I vote for that patch, which is quite useful. I cannot review the code itself (except syntactically) since I'm not a specialist of locks. Paul Zimmermann ^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [v2] New benchtest: pthread locks 2020-10-10 3:55 ` [v2] " DJ Delorie 2020-10-12 7:05 ` Paul Zimmermann @ 2020-10-14 1:27 ` Siddhesh Poyarekar 2020-10-14 1:32 ` DJ Delorie 2020-10-21 15:18 ` DJ Delorie 1 sibling, 2 replies; 8+ messages in thread From: Siddhesh Poyarekar @ 2020-10-14 1:27 UTC (permalink / raw) To: DJ Delorie, libc-alpha On 10/10/20 9:25 AM, DJ Delorie via Libc-alpha wrote: > > v2 changes do_bench_2 to run multiple passes and provide a mean and > standard deviation as well as some other statistics, since I got tired > of running the whole benchmark multiple times and computing them > manually ;-) > > From 94653e93eec9403ebf61b00179799751ce12088d Mon Sep 17 00:00:00 2001 > From: DJ Delorie <dj@redhat.com> > Date: Wed, 7 Oct 2020 17:04:12 -0400 > Subject: New benchtest: pthread locks > > Performance benchmarks for various posix locks: mutex, rwlock, > spinlock, condvar, and semaphore. Each test is performed with > an empty loop body or with a computationally "interesting" (i.e. > difficult to optimize away, and used just to allow lock code to > be "hidden" in the filler's CPU cycles). This is good to commit, thanks for writing these tests! > +/* The point of this benchmark is to measure the overhead of an empty > + critical section or a small critical section. This is never going > + to be indicative of real application performance. Instead we are > + trying to benchmark the effects of the compiler and the runtime > + coupled with a particular set of hardware atomic operations. > + The numbers from this benchmark should be taken with a massive gain > + of salt and viewed through the eyes of expert reviewers. */ That is a great comment. All benchmarks should have a goal set like this so that it is clear to whoever evaluates them that just getting better numbers in a specific benchmark is not good enough and one would have to evaluate why a certain set of results supports their optimisation. Siddhesh ^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [v2] New benchtest: pthread locks 2020-10-14 1:27 ` Siddhesh Poyarekar @ 2020-10-14 1:32 ` DJ Delorie 2020-10-21 15:18 ` DJ Delorie 1 sibling, 0 replies; 8+ messages in thread From: DJ Delorie @ 2020-10-14 1:32 UTC (permalink / raw) To: Siddhesh Poyarekar; +Cc: libc-alpha Siddhesh Poyarekar <siddhesh@gotplt.org> writes: >> +/* The point of this benchmark is to measure the overhead of an empty >> + critical section or a small critical section. This is never going >> + to be indicative of real application performance. Instead we are >> + trying to benchmark the effects of the compiler and the runtime >> + coupled with a particular set of hardware atomic operations. >> + The numbers from this benchmark should be taken with a massive gain >> + of salt and viewed through the eyes of expert reviewers. */ > > That is a great comment. All benchmarks should have a goal set like > this so that it is clear to whoever evaluates them that just getting > better numbers in a specific benchmark is not good enough and one would > have to evaluate why a certain set of results supports their optimisation. IIRC Carlos provided that text in one of his internal pre-reviews, so kudos to him. The remainder of the comments are mine though. ^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [v2] New benchtest: pthread locks 2020-10-14 1:27 ` Siddhesh Poyarekar 2020-10-14 1:32 ` DJ Delorie @ 2020-10-21 15:18 ` DJ Delorie 1 sibling, 0 replies; 8+ messages in thread From: DJ Delorie @ 2020-10-21 15:18 UTC (permalink / raw) To: Siddhesh Poyarekar; +Cc: libc-alpha Siddhesh Poyarekar <siddhesh@gotplt.org> writes: > This is good to commit, thanks for writing these tests! Thanks! Pushed. ^ permalink raw reply [flat|nested] 8+ messages in thread
end of thread, other threads:[~2020-10-21 15:18 UTC | newest] Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2020-10-08 5:34 New benchtest: pthread locks DJ Delorie 2020-10-10 3:55 ` [v2] " DJ Delorie 2020-10-12 7:05 ` Paul Zimmermann 2020-10-12 17:33 ` DJ Delorie 2020-10-13 8:25 ` Paul Zimmermann 2020-10-14 1:27 ` Siddhesh Poyarekar 2020-10-14 1:32 ` DJ Delorie 2020-10-21 15:18 ` DJ Delorie
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).