Reuses infrastructure from previous pthread_mutex_lock benchmarks to test other performance sensitive functions. --- benchtests/Makefile | 10 ++++- ...utex-locks.c => bench-pthread-lock-base.c} | 20 +++++----- benchtests/bench-pthread-mutex-lock.c | 32 ++++++++++++++++ benchtests/bench-pthread-mutex-trylock.c | 37 +++++++++++++++++++ benchtests/bench-pthread-spin-lock.c | 30 +++++++++++++++ benchtests/bench-pthread-spin-trylock.c | 34 +++++++++++++++++ 6 files changed, 151 insertions(+), 12 deletions(-) rename benchtests/{bench-pthread-mutex-locks.c => bench-pthread-lock-base.c} (93%) create mode 100644 benchtests/bench-pthread-mutex-lock.c create mode 100644 benchtests/bench-pthread-mutex-trylock.c create mode 100644 benchtests/bench-pthread-spin-lock.c create mode 100644 benchtests/bench-pthread-spin-trylock.c diff --git a/benchtests/Makefile b/benchtests/Makefile index d99771be74..fc1cda7fc3 100644 --- a/benchtests/Makefile +++ b/benchtests/Makefile @@ -103,11 +103,19 @@ endif bench-pthread := \ pthread-locks \ - pthread-mutex-locks \ + pthread-mutex-lock \ + pthread-mutex-trylock \ + pthread-spin-lock \ + pthread-spin-trylock \ pthread_once \ thread_create \ # bench-pthread +LDLIBS-bench-pthread-mutex-lock += -lm +LDLIBS-bench-pthread-mutex-trylock += -lm +LDLIBS-bench-pthread-spin-lock += -lm +LDLIBS-bench-pthread-spin-trylock += -lm + bench-string := \ ffs \ ffsll \ diff --git a/benchtests/bench-pthread-mutex-locks.c b/benchtests/bench-pthread-lock-base.c similarity index 93% rename from benchtests/bench-pthread-mutex-locks.c rename to benchtests/bench-pthread-lock-base.c index 1685b9dd1f..fac8a12b52 100644 --- a/benchtests/bench-pthread-mutex-locks.c +++ b/benchtests/bench-pthread-lock-base.c @@ -1,4 +1,4 @@ -/* Measure mutex_lock for different threads and critical sections. +/* Measure lock functions for different threads and critical sections. Copyright (C) 2022 Free Software Foundation, Inc. This file is part of the GNU C Library. 
@@ -17,7 +17,6 @@ <https://www.gnu.org/licenses/>. */ #define TEST_MAIN -#define TEST_NAME "pthread-mutex-locks" #define TIMEOUT (20 * 60) #include <stdio.h> @@ -31,8 +30,8 @@ #include "bench-timing.h" #include "json-lib.h" -static pthread_mutex_t lock; -static pthread_mutexattr_t attr; +static bench_lock_t lock; +static bench_lock_attr_t attr; static pthread_barrier_t barrier; #define START_ITERS 1000 @@ -104,9 +103,9 @@ worker (void *v) TIMING_NOW (start); while (iters--) { - pthread_mutex_lock (&lock); + LOCK (&lock); critical_section (crt_len); - pthread_mutex_unlock (&lock); + UNLOCK (&lock); non_critical_section (non_crt_len); } TIMING_NOW (stop); @@ -123,7 +122,7 @@ do_one_test (int num_threads, int crt_len, int non_crt_len, long iters) Worker_Params *p, params[num_threads]; pthread_t threads[num_threads]; - pthread_mutex_init (&lock, &attr); + LOCK_INIT (&lock, &attr); pthread_barrier_init (&barrier, NULL, num_threads); for (i = 0; i < num_threads; i++) @@ -137,7 +136,7 @@ do_one_test (int num_threads, int crt_len, int non_crt_len, long iters) for (i = 0; i < num_threads; i++) pthread_join (threads[i], NULL); - pthread_mutex_destroy (&lock); + LOCK_DESTROY (&lock); pthread_barrier_destroy (&barrier); mean = 0; @@ -246,7 +245,7 @@ do_bench (void) char name[128]; json_init (&json_ctx, 2, stdout); - json_attr_object_begin (&json_ctx, "pthread_mutex_locks"); + json_attr_object_begin (&json_ctx, TEST_NAME); /* The thread config begins from 1, and increases by 2x until nprocs. We also wants to test over-saturation case (1.25*nprocs). 
*/ @@ -260,8 +259,7 @@ do_bench (void) threads[th_conf++] = nprocs; threads[th_conf++] = nprocs + nprocs / 4; - pthread_mutexattr_init (&attr); - pthread_mutexattr_settype (&attr, PTHREAD_MUTEX_ADAPTIVE_NP); + LOCK_ATTR_INIT (&attr); snprintf (name, sizeof name, "type=adaptive"); for (k = 0; k < (sizeof (non_crt_lens) / sizeof (int)); k++) diff --git a/benchtests/bench-pthread-mutex-lock.c b/benchtests/bench-pthread-mutex-lock.c new file mode 100644 index 0000000000..16556d4116 --- /dev/null +++ b/benchtests/bench-pthread-mutex-lock.c @@ -0,0 +1,32 @@ +/* Measure mutex_lock for different threads and critical sections. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#define LOCK(lock) pthread_mutex_lock (lock) +#define UNLOCK(lock) pthread_mutex_unlock (lock) +#define LOCK_INIT(lock, attr) pthread_mutex_init (lock, attr) +#define LOCK_DESTROY(lock) pthread_mutex_destroy (lock) +#define LOCK_ATTR_INIT(attr) \ + pthread_mutexattr_init (attr); \ + pthread_mutexattr_settype (attr, PTHREAD_MUTEX_ADAPTIVE_NP); + +#define bench_lock_t pthread_mutex_t +#define bench_lock_attr_t pthread_mutexattr_t + +#define TEST_NAME "pthread-mutex-lock" + +#include "bench-pthread-lock-base.c" diff --git a/benchtests/bench-pthread-mutex-trylock.c b/benchtests/bench-pthread-mutex-trylock.c new file mode 100644 index 0000000000..66318f499f --- /dev/null +++ b/benchtests/bench-pthread-mutex-trylock.c @@ -0,0 +1,37 @@ +/* Measure mutex_trylock for different threads and critical sections. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#define LOCK(lock) \ + while (pthread_mutex_trylock (lock) != 0) \ + { \ + non_critical_section (non_crt_len); \ + } + +#define UNLOCK(lock) pthread_mutex_unlock (lock) +#define LOCK_INIT(lock, attr) pthread_mutex_init (lock, attr) +#define LOCK_DESTROY(lock) pthread_mutex_destroy (lock) +#define LOCK_ATTR_INIT(attr) \ + pthread_mutexattr_init (attr); \ + pthread_mutexattr_settype (attr, PTHREAD_MUTEX_ADAPTIVE_NP); + +#define bench_lock_t pthread_mutex_t +#define bench_lock_attr_t pthread_mutexattr_t + +#define TEST_NAME "pthread-mutex-trylock" + +#include "bench-pthread-lock-base.c" diff --git a/benchtests/bench-pthread-spin-lock.c b/benchtests/bench-pthread-spin-lock.c new file mode 100644 index 0000000000..2174933d6b --- /dev/null +++ b/benchtests/bench-pthread-spin-lock.c @@ -0,0 +1,30 @@ +/* Measure mutex_trylock for different threads and critical sections. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#define LOCK(lock) pthread_spin_lock (lock) +#define UNLOCK(lock) pthread_spin_unlock (lock) +#define LOCK_INIT(lock, attr) pthread_spin_init (lock, *(attr)) +#define LOCK_DESTROY(lock) pthread_spin_destroy (lock) +#define LOCK_ATTR_INIT(attr) *(attr) = 0 + +#define bench_lock_t pthread_spinlock_t +#define bench_lock_attr_t int + +#define TEST_NAME "pthread-spin-lock" + +#include "bench-pthread-lock-base.c" diff --git a/benchtests/bench-pthread-spin-trylock.c b/benchtests/bench-pthread-spin-trylock.c new file mode 100644 index 0000000000..49eb972761 --- /dev/null +++ b/benchtests/bench-pthread-spin-trylock.c @@ -0,0 +1,34 @@ +/* Measure spin_trylock for different threads and critical sections. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#define LOCK(lock) \ + while (pthread_spin_lock (lock) != 0) \ + { \ + non_critical_section (non_crt_len); \ + } +#define UNLOCK(lock) pthread_spin_unlock (lock) +#define LOCK_INIT(lock, attr) pthread_spin_init (lock, *(attr)) +#define LOCK_DESTROY(lock) pthread_spin_destroy (lock) +#define LOCK_ATTR_INIT(attr) *(attr) = 0 + +#define bench_lock_t pthread_spinlock_t +#define bench_lock_attr_t int + +#define TEST_NAME "pthread-spin-trylock" + +#include "bench-pthread-lock-base.c" -- 2.34.1
Save a jmp on the lock path coming from an initial failure in pthread_spin_lock.S. This costs 4-bytes of code but since the function still fits in the same number of 16-byte blocks (default function alignment) it does not have affect on the total binary size of libc.so (unchanged after this commit). pthread_spin_trylock was using a CAS when a simple xchg works which is often more expensive. Full check passes on x86-64. --- sysdeps/x86_64/nptl/pthread_spin_lock.S | 23 +++++++++++++++------- sysdeps/x86_64/nptl/pthread_spin_trylock.S | 18 ++++++++++++----- 2 files changed, 29 insertions(+), 12 deletions(-) diff --git a/sysdeps/x86_64/nptl/pthread_spin_lock.S b/sysdeps/x86_64/nptl/pthread_spin_lock.S index 44b837d9db..1e09e59b10 100644 --- a/sysdeps/x86_64/nptl/pthread_spin_lock.S +++ b/sysdeps/x86_64/nptl/pthread_spin_lock.S @@ -19,18 +19,27 @@ #include <shlib-compat.h> ENTRY(__pthread_spin_lock) -1: LOCK - decl 0(%rdi) - jne 2f + /* Always return zero. */ xor %eax, %eax + LOCK + decl 0(%rdi) + jne 1f ret .align 16 -2: rep +1: + /* `rep nop` == `pause`. */ + rep nop - cmpl $0, 0(%rdi) - jg 1b - jmp 2b + cmpl %eax, 0(%rdi) + jle 1b + /* Just repeat the `lock decl` logic here. The code size save + of jumping back to entry doesn't change how many 16-byte + chunks (default function alignment) that the code fits in. */ + LOCK + decl 0(%rdi) + jne 1b + ret END(__pthread_spin_lock) versioned_symbol (libc, __pthread_spin_lock, pthread_spin_lock, GLIBC_2_34) diff --git a/sysdeps/x86_64/nptl/pthread_spin_trylock.S b/sysdeps/x86_64/nptl/pthread_spin_trylock.S index fffdb27dd9..a1f97cb420 100644 --- a/sysdeps/x86_64/nptl/pthread_spin_trylock.S +++ b/sysdeps/x86_64/nptl/pthread_spin_trylock.S @@ -20,13 +20,21 @@ #include <shlib-compat.h> ENTRY(__pthread_spin_trylock) - movl $1, %eax xorl %ecx, %ecx - lock - cmpxchgl %ecx, (%rdi) + /* xchg has implicit LOCK prefix. */ + xchgl %ecx, (%rdi) + + /* Branch on result. 
Expectation is the use of trylock will be + branching on success/failure so this branch can be used to + to predict the coming branch. It has the benefit of + breaking the likely expensive memory dependency on (%rdi). */ + cmpl $1, %ecx + jnz 1f + xorl %eax, %eax + ret +1: movl $EBUSY, %eax - cmovel %ecx, %eax - retq + ret END(__pthread_spin_trylock) versioned_symbol (libc, __pthread_spin_trylock, pthread_spin_trylock, GLIBC_2_34) -- 2.34.1
Reuses infrastructure from previous pthread_mutex_lock benchmarks to test other performance sensitive functions. --- benchtests/Makefile | 10 ++++- ...utex-locks.c => bench-pthread-lock-base.c} | 20 +++++----- benchtests/bench-pthread-mutex-lock.c | 32 ++++++++++++++++ benchtests/bench-pthread-mutex-trylock.c | 37 +++++++++++++++++++ benchtests/bench-pthread-spin-lock.c | 30 +++++++++++++++ benchtests/bench-pthread-spin-trylock.c | 34 +++++++++++++++++ 6 files changed, 151 insertions(+), 12 deletions(-) rename benchtests/{bench-pthread-mutex-locks.c => bench-pthread-lock-base.c} (93%) create mode 100644 benchtests/bench-pthread-mutex-lock.c create mode 100644 benchtests/bench-pthread-mutex-trylock.c create mode 100644 benchtests/bench-pthread-spin-lock.c create mode 100644 benchtests/bench-pthread-spin-trylock.c diff --git a/benchtests/Makefile b/benchtests/Makefile index d99771be74..fc1cda7fc3 100644 --- a/benchtests/Makefile +++ b/benchtests/Makefile @@ -103,11 +103,19 @@ endif bench-pthread := \ pthread-locks \ - pthread-mutex-locks \ + pthread-mutex-lock \ + pthread-mutex-trylock \ + pthread-spin-lock \ + pthread-spin-trylock \ pthread_once \ thread_create \ # bench-pthread +LDLIBS-bench-pthread-mutex-lock += -lm +LDLIBS-bench-pthread-mutex-trylock += -lm +LDLIBS-bench-pthread-spin-lock += -lm +LDLIBS-bench-pthread-spin-trylock += -lm + bench-string := \ ffs \ ffsll \ diff --git a/benchtests/bench-pthread-mutex-locks.c b/benchtests/bench-pthread-lock-base.c similarity index 93% rename from benchtests/bench-pthread-mutex-locks.c rename to benchtests/bench-pthread-lock-base.c index 1685b9dd1f..fac8a12b52 100644 --- a/benchtests/bench-pthread-mutex-locks.c +++ b/benchtests/bench-pthread-lock-base.c @@ -1,4 +1,4 @@ -/* Measure mutex_lock for different threads and critical sections. +/* Measure lock functions for different threads and critical sections. Copyright (C) 2022 Free Software Foundation, Inc. This file is part of the GNU C Library. 
@@ -17,7 +17,6 @@ <https://www.gnu.org/licenses/>. */ #define TEST_MAIN -#define TEST_NAME "pthread-mutex-locks" #define TIMEOUT (20 * 60) #include <stdio.h> @@ -31,8 +30,8 @@ #include "bench-timing.h" #include "json-lib.h" -static pthread_mutex_t lock; -static pthread_mutexattr_t attr; +static bench_lock_t lock; +static bench_lock_attr_t attr; static pthread_barrier_t barrier; #define START_ITERS 1000 @@ -104,9 +103,9 @@ worker (void *v) TIMING_NOW (start); while (iters--) { - pthread_mutex_lock (&lock); + LOCK (&lock); critical_section (crt_len); - pthread_mutex_unlock (&lock); + UNLOCK (&lock); non_critical_section (non_crt_len); } TIMING_NOW (stop); @@ -123,7 +122,7 @@ do_one_test (int num_threads, int crt_len, int non_crt_len, long iters) Worker_Params *p, params[num_threads]; pthread_t threads[num_threads]; - pthread_mutex_init (&lock, &attr); + LOCK_INIT (&lock, &attr); pthread_barrier_init (&barrier, NULL, num_threads); for (i = 0; i < num_threads; i++) @@ -137,7 +136,7 @@ do_one_test (int num_threads, int crt_len, int non_crt_len, long iters) for (i = 0; i < num_threads; i++) pthread_join (threads[i], NULL); - pthread_mutex_destroy (&lock); + LOCK_DESTROY (&lock); pthread_barrier_destroy (&barrier); mean = 0; @@ -246,7 +245,7 @@ do_bench (void) char name[128]; json_init (&json_ctx, 2, stdout); - json_attr_object_begin (&json_ctx, "pthread_mutex_locks"); + json_attr_object_begin (&json_ctx, TEST_NAME); /* The thread config begins from 1, and increases by 2x until nprocs. We also wants to test over-saturation case (1.25*nprocs). 
*/ @@ -260,8 +259,7 @@ do_bench (void) threads[th_conf++] = nprocs; threads[th_conf++] = nprocs + nprocs / 4; - pthread_mutexattr_init (&attr); - pthread_mutexattr_settype (&attr, PTHREAD_MUTEX_ADAPTIVE_NP); + LOCK_ATTR_INIT (&attr); snprintf (name, sizeof name, "type=adaptive"); for (k = 0; k < (sizeof (non_crt_lens) / sizeof (int)); k++) diff --git a/benchtests/bench-pthread-mutex-lock.c b/benchtests/bench-pthread-mutex-lock.c new file mode 100644 index 0000000000..16556d4116 --- /dev/null +++ b/benchtests/bench-pthread-mutex-lock.c @@ -0,0 +1,32 @@ +/* Measure mutex_lock for different threads and critical sections. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#define LOCK(lock) pthread_mutex_lock (lock) +#define UNLOCK(lock) pthread_mutex_unlock (lock) +#define LOCK_INIT(lock, attr) pthread_mutex_init (lock, attr) +#define LOCK_DESTROY(lock) pthread_mutex_destroy (lock) +#define LOCK_ATTR_INIT(attr) \ + pthread_mutexattr_init (attr); \ + pthread_mutexattr_settype (attr, PTHREAD_MUTEX_ADAPTIVE_NP); + +#define bench_lock_t pthread_mutex_t +#define bench_lock_attr_t pthread_mutexattr_t + +#define TEST_NAME "pthread-mutex-lock" + +#include "bench-pthread-lock-base.c" diff --git a/benchtests/bench-pthread-mutex-trylock.c b/benchtests/bench-pthread-mutex-trylock.c new file mode 100644 index 0000000000..66318f499f --- /dev/null +++ b/benchtests/bench-pthread-mutex-trylock.c @@ -0,0 +1,37 @@ +/* Measure mutex_trylock for different threads and critical sections. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#define LOCK(lock) \ + while (pthread_mutex_trylock (lock) != 0) \ + { \ + non_critical_section (non_crt_len); \ + } + +#define UNLOCK(lock) pthread_mutex_unlock (lock) +#define LOCK_INIT(lock, attr) pthread_mutex_init (lock, attr) +#define LOCK_DESTROY(lock) pthread_mutex_destroy (lock) +#define LOCK_ATTR_INIT(attr) \ + pthread_mutexattr_init (attr); \ + pthread_mutexattr_settype (attr, PTHREAD_MUTEX_ADAPTIVE_NP); + +#define bench_lock_t pthread_mutex_t +#define bench_lock_attr_t pthread_mutexattr_t + +#define TEST_NAME "pthread-mutex-trylock" + +#include "bench-pthread-lock-base.c" diff --git a/benchtests/bench-pthread-spin-lock.c b/benchtests/bench-pthread-spin-lock.c new file mode 100644 index 0000000000..2174933d6b --- /dev/null +++ b/benchtests/bench-pthread-spin-lock.c @@ -0,0 +1,30 @@ +/* Measure mutex_trylock for different threads and critical sections. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#define LOCK(lock) pthread_spin_lock (lock) +#define UNLOCK(lock) pthread_spin_unlock (lock) +#define LOCK_INIT(lock, attr) pthread_spin_init (lock, *(attr)) +#define LOCK_DESTROY(lock) pthread_spin_destroy (lock) +#define LOCK_ATTR_INIT(attr) *(attr) = 0 + +#define bench_lock_t pthread_spinlock_t +#define bench_lock_attr_t int + +#define TEST_NAME "pthread-spin-lock" + +#include "bench-pthread-lock-base.c" diff --git a/benchtests/bench-pthread-spin-trylock.c b/benchtests/bench-pthread-spin-trylock.c new file mode 100644 index 0000000000..49eb972761 --- /dev/null +++ b/benchtests/bench-pthread-spin-trylock.c @@ -0,0 +1,34 @@ +/* Measure spin_trylock for different threads and critical sections. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#define LOCK(lock) \ + while (pthread_spin_lock (lock) != 0) \ + { \ + non_critical_section (non_crt_len); \ + } +#define UNLOCK(lock) pthread_spin_unlock (lock) +#define LOCK_INIT(lock, attr) pthread_spin_init (lock, *(attr)) +#define LOCK_DESTROY(lock) pthread_spin_destroy (lock) +#define LOCK_ATTR_INIT(attr) *(attr) = 0 + +#define bench_lock_t pthread_spinlock_t +#define bench_lock_attr_t int + +#define TEST_NAME "pthread-spin-trylock" + +#include "bench-pthread-lock-base.c" -- 2.34.1
Save a jmp on the lock path coming from an initial failure in pthread_spin_lock.S. This costs 4-bytes of code but since the function still fits in the same number of 16-byte blocks (default function alignment) it does not have affect on the total binary size of libc.so (unchanged after this commit). pthread_spin_trylock was using a CAS when a simple xchg works which is often more expensive. Full check passes on x86-64. --- sysdeps/x86_64/nptl/pthread_spin_lock.S | 23 +++++++++++++++------- sysdeps/x86_64/nptl/pthread_spin_trylock.S | 18 ++++++++++++----- 2 files changed, 29 insertions(+), 12 deletions(-) diff --git a/sysdeps/x86_64/nptl/pthread_spin_lock.S b/sysdeps/x86_64/nptl/pthread_spin_lock.S index 44b837d9db..1e09e59b10 100644 --- a/sysdeps/x86_64/nptl/pthread_spin_lock.S +++ b/sysdeps/x86_64/nptl/pthread_spin_lock.S @@ -19,18 +19,27 @@ #include <shlib-compat.h> ENTRY(__pthread_spin_lock) -1: LOCK - decl 0(%rdi) - jne 2f + /* Always return zero. */ xor %eax, %eax + LOCK + decl 0(%rdi) + jne 1f ret .align 16 -2: rep +1: + /* `rep nop` == `pause`. */ + rep nop - cmpl $0, 0(%rdi) - jg 1b - jmp 2b + cmpl %eax, 0(%rdi) + jle 1b + /* Just repeat the `lock decl` logic here. The code size save + of jumping back to entry doesn't change how many 16-byte + chunks (default function alignment) that the code fits in. */ + LOCK + decl 0(%rdi) + jne 1b + ret END(__pthread_spin_lock) versioned_symbol (libc, __pthread_spin_lock, pthread_spin_lock, GLIBC_2_34) diff --git a/sysdeps/x86_64/nptl/pthread_spin_trylock.S b/sysdeps/x86_64/nptl/pthread_spin_trylock.S index fffdb27dd9..a1f97cb420 100644 --- a/sysdeps/x86_64/nptl/pthread_spin_trylock.S +++ b/sysdeps/x86_64/nptl/pthread_spin_trylock.S @@ -20,13 +20,21 @@ #include <shlib-compat.h> ENTRY(__pthread_spin_trylock) - movl $1, %eax xorl %ecx, %ecx - lock - cmpxchgl %ecx, (%rdi) + /* xchg has implicit LOCK prefix. */ + xchgl %ecx, (%rdi) + + /* Branch on result. 
Expectation is the use of trylock will be + branching on success/failure so this branch can be used to + to predict the coming branch. It has the benefit of + breaking the likely expensive memory dependency on (%rdi). */ + cmpl $1, %ecx + jnz 1f + xorl %eax, %eax + ret +1: movl $EBUSY, %eax - cmovel %ecx, %eax - retq + ret END(__pthread_spin_trylock) versioned_symbol (libc, __pthread_spin_trylock, pthread_spin_trylock, GLIBC_2_34) -- 2.34.1
On Fri, Sep 30, 2022 at 9:13 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Save a jmp on the lock path coming from an initial failure in
> pthread_spin_lock.S. This costs 4 bytes of code, but since the
> function still fits in the same number of 16-byte blocks (default
> function alignment) it has no effect on the total binary size
> of libc.so (unchanged after this commit).
>
> pthread_spin_trylock was using a CAS, which is often more expensive,
> when a simple xchg works.
>
> Full check passes on x86-64.
> ---
> sysdeps/x86_64/nptl/pthread_spin_lock.S | 23 +++++++++++++++-------
> sysdeps/x86_64/nptl/pthread_spin_trylock.S | 18 ++++++++++++-----
> 2 files changed, 29 insertions(+), 12 deletions(-)
>
> diff --git a/sysdeps/x86_64/nptl/pthread_spin_lock.S b/sysdeps/x86_64/nptl/pthread_spin_lock.S
> index 44b837d9db..1e09e59b10 100644
> --- a/sysdeps/x86_64/nptl/pthread_spin_lock.S
> +++ b/sysdeps/x86_64/nptl/pthread_spin_lock.S
> @@ -19,18 +19,27 @@
> #include <shlib-compat.h>
>
> ENTRY(__pthread_spin_lock)
> -1: LOCK
> - decl 0(%rdi)
> - jne 2f
> + /* Always return zero. */
> xor %eax, %eax
> + LOCK
> + decl 0(%rdi)
> + jne 1f
> ret
>
> .align 16
> -2: rep
> +1:
> + /* `rep nop` == `pause`. */
> + rep
> nop
> - cmpl $0, 0(%rdi)
> - jg 1b
> - jmp 2b
> + cmpl %eax, 0(%rdi)
> + jle 1b
> + /* Just repeat the `lock decl` logic here. The code size saved
> + by jumping back to entry wouldn't change how many 16-byte
> + chunks (default function alignment) the code fits in. */
> + LOCK
> + decl 0(%rdi)
> + jne 1b
> + ret
> END(__pthread_spin_lock)
> versioned_symbol (libc, __pthread_spin_lock, pthread_spin_lock, GLIBC_2_34)
>
> diff --git a/sysdeps/x86_64/nptl/pthread_spin_trylock.S b/sysdeps/x86_64/nptl/pthread_spin_trylock.S
> index fffdb27dd9..a1f97cb420 100644
> --- a/sysdeps/x86_64/nptl/pthread_spin_trylock.S
> +++ b/sysdeps/x86_64/nptl/pthread_spin_trylock.S
> @@ -20,13 +20,21 @@
> #include <shlib-compat.h>
>
> ENTRY(__pthread_spin_trylock)
> - movl $1, %eax
> xorl %ecx, %ecx
> - lock
> - cmpxchgl %ecx, (%rdi)
> + /* xchg has implicit LOCK prefix. */
> + xchgl %ecx, (%rdi)
> +
> + /* Branch on result. Expectation is that users of trylock will be
> + branching on success/failure, so this branch can be used
> + to predict the coming branch. It has the benefit of
> + breaking the likely expensive memory dependency on (%rdi). */
> + cmpl $1, %ecx
> + jnz 1f
> + xorl %eax, %eax
> + ret
> +1:
> movl $EBUSY, %eax
> - cmovel %ecx, %eax
> - retq
> + ret
> END(__pthread_spin_trylock)
> versioned_symbol (libc, __pthread_spin_trylock, pthread_spin_trylock,
> GLIBC_2_34)
> --
> 2.34.1
>
LGTM.
Thanks.
--
H.J.
On Fri, Sep 30, 2022 at 9:13 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Reuses infrastructure from previous pthread_mutex_lock benchmarks to
> test other performance-sensitive functions.
> ---
> benchtests/Makefile | 10 ++++-
> ...utex-locks.c => bench-pthread-lock-base.c} | 20 +++++-----
> benchtests/bench-pthread-mutex-lock.c | 32 ++++++++++++++++
> benchtests/bench-pthread-mutex-trylock.c | 37 +++++++++++++++++++
> benchtests/bench-pthread-spin-lock.c | 30 +++++++++++++++
> benchtests/bench-pthread-spin-trylock.c | 34 +++++++++++++++++
> 6 files changed, 151 insertions(+), 12 deletions(-)
> rename benchtests/{bench-pthread-mutex-locks.c => bench-pthread-lock-base.c} (93%)
> create mode 100644 benchtests/bench-pthread-mutex-lock.c
> create mode 100644 benchtests/bench-pthread-mutex-trylock.c
> create mode 100644 benchtests/bench-pthread-spin-lock.c
> create mode 100644 benchtests/bench-pthread-spin-trylock.c
>
> diff --git a/benchtests/Makefile b/benchtests/Makefile
> index d99771be74..fc1cda7fc3 100644
> --- a/benchtests/Makefile
> +++ b/benchtests/Makefile
> @@ -103,11 +103,19 @@ endif
>
> bench-pthread := \
> pthread-locks \
> - pthread-mutex-locks \
> + pthread-mutex-lock \
> + pthread-mutex-trylock \
> + pthread-spin-lock \
> + pthread-spin-trylock \
> pthread_once \
> thread_create \
> # bench-pthread
>
> +LDLIBS-bench-pthread-mutex-lock += -lm
> +LDLIBS-bench-pthread-mutex-trylock += -lm
> +LDLIBS-bench-pthread-spin-lock += -lm
> +LDLIBS-bench-pthread-spin-trylock += -lm
> +
> bench-string := \
> ffs \
> ffsll \
> diff --git a/benchtests/bench-pthread-mutex-locks.c b/benchtests/bench-pthread-lock-base.c
> similarity index 93%
> rename from benchtests/bench-pthread-mutex-locks.c
> rename to benchtests/bench-pthread-lock-base.c
> index 1685b9dd1f..fac8a12b52 100644
> --- a/benchtests/bench-pthread-mutex-locks.c
> +++ b/benchtests/bench-pthread-lock-base.c
> @@ -1,4 +1,4 @@
> -/* Measure mutex_lock for different threads and critical sections.
> +/* Measure lock functions for different threads and critical sections.
> Copyright (C) 2022 Free Software Foundation, Inc.
> This file is part of the GNU C Library.
>
> @@ -17,7 +17,6 @@
> <https://www.gnu.org/licenses/>. */
>
> #define TEST_MAIN
> -#define TEST_NAME "pthread-mutex-locks"
> #define TIMEOUT (20 * 60)
>
> #include <stdio.h>
> @@ -31,8 +30,8 @@
> #include "bench-timing.h"
> #include "json-lib.h"
>
> -static pthread_mutex_t lock;
> -static pthread_mutexattr_t attr;
> +static bench_lock_t lock;
> +static bench_lock_attr_t attr;
> static pthread_barrier_t barrier;
>
> #define START_ITERS 1000
> @@ -104,9 +103,9 @@ worker (void *v)
> TIMING_NOW (start);
> while (iters--)
> {
> - pthread_mutex_lock (&lock);
> + LOCK (&lock);
> critical_section (crt_len);
> - pthread_mutex_unlock (&lock);
> + UNLOCK (&lock);
> non_critical_section (non_crt_len);
> }
> TIMING_NOW (stop);
> @@ -123,7 +122,7 @@ do_one_test (int num_threads, int crt_len, int non_crt_len, long iters)
> Worker_Params *p, params[num_threads];
> pthread_t threads[num_threads];
>
> - pthread_mutex_init (&lock, &attr);
> + LOCK_INIT (&lock, &attr);
> pthread_barrier_init (&barrier, NULL, num_threads);
>
> for (i = 0; i < num_threads; i++)
> @@ -137,7 +136,7 @@ do_one_test (int num_threads, int crt_len, int non_crt_len, long iters)
> for (i = 0; i < num_threads; i++)
> pthread_join (threads[i], NULL);
>
> - pthread_mutex_destroy (&lock);
> + LOCK_DESTROY (&lock);
> pthread_barrier_destroy (&barrier);
>
> mean = 0;
> @@ -246,7 +245,7 @@ do_bench (void)
> char name[128];
>
> json_init (&json_ctx, 2, stdout);
> - json_attr_object_begin (&json_ctx, "pthread_mutex_locks");
> + json_attr_object_begin (&json_ctx, TEST_NAME);
>
> /* The thread config begins from 1, and increases by 2x until nprocs.
> We also wants to test over-saturation case (1.25*nprocs). */
> @@ -260,8 +259,7 @@ do_bench (void)
> threads[th_conf++] = nprocs;
> threads[th_conf++] = nprocs + nprocs / 4;
>
> - pthread_mutexattr_init (&attr);
> - pthread_mutexattr_settype (&attr, PTHREAD_MUTEX_ADAPTIVE_NP);
> + LOCK_ATTR_INIT (&attr);
> snprintf (name, sizeof name, "type=adaptive");
>
> for (k = 0; k < (sizeof (non_crt_lens) / sizeof (int)); k++)
> diff --git a/benchtests/bench-pthread-mutex-lock.c b/benchtests/bench-pthread-mutex-lock.c
> new file mode 100644
> index 0000000000..16556d4116
> --- /dev/null
> +++ b/benchtests/bench-pthread-mutex-lock.c
> @@ -0,0 +1,32 @@
> +/* Measure mutex_lock for different threads and critical sections.
> + Copyright (C) 2022 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#define LOCK(lock) pthread_mutex_lock (lock)
> +#define UNLOCK(lock) pthread_mutex_unlock (lock)
> +#define LOCK_INIT(lock, attr) pthread_mutex_init (lock, attr)
> +#define LOCK_DESTROY(lock) pthread_mutex_destroy (lock)
> +#define LOCK_ATTR_INIT(attr) \
> + pthread_mutexattr_init (attr); \
> + pthread_mutexattr_settype (attr, PTHREAD_MUTEX_ADAPTIVE_NP);
> +
> +#define bench_lock_t pthread_mutex_t
> +#define bench_lock_attr_t pthread_mutexattr_t
> +
> +#define TEST_NAME "pthread-mutex-lock"
> +
> +#include "bench-pthread-lock-base.c"
> diff --git a/benchtests/bench-pthread-mutex-trylock.c b/benchtests/bench-pthread-mutex-trylock.c
> new file mode 100644
> index 0000000000..66318f499f
> --- /dev/null
> +++ b/benchtests/bench-pthread-mutex-trylock.c
> @@ -0,0 +1,37 @@
> +/* Measure mutex_trylock for different threads and critical sections.
> + Copyright (C) 2022 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#define LOCK(lock) \
> + while (pthread_mutex_trylock (lock) != 0) \
> + { \
> + non_critical_section (non_crt_len); \
> + }
> +
> +#define UNLOCK(lock) pthread_mutex_unlock (lock)
> +#define LOCK_INIT(lock, attr) pthread_mutex_init (lock, attr)
> +#define LOCK_DESTROY(lock) pthread_mutex_destroy (lock)
> +#define LOCK_ATTR_INIT(attr) \
> + pthread_mutexattr_init (attr); \
> + pthread_mutexattr_settype (attr, PTHREAD_MUTEX_ADAPTIVE_NP);
> +
> +#define bench_lock_t pthread_mutex_t
> +#define bench_lock_attr_t pthread_mutexattr_t
> +
> +#define TEST_NAME "pthread-mutex-trylock"
> +
> +#include "bench-pthread-lock-base.c"
> diff --git a/benchtests/bench-pthread-spin-lock.c b/benchtests/bench-pthread-spin-lock.c
> new file mode 100644
> index 0000000000..2174933d6b
> --- /dev/null
> +++ b/benchtests/bench-pthread-spin-lock.c
> @@ -0,0 +1,30 @@
> +/* Measure spin_lock for different threads and critical sections.
> + Copyright (C) 2022 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#define LOCK(lock) pthread_spin_lock (lock)
> +#define UNLOCK(lock) pthread_spin_unlock (lock)
> +#define LOCK_INIT(lock, attr) pthread_spin_init (lock, *(attr))
> +#define LOCK_DESTROY(lock) pthread_spin_destroy (lock)
> +#define LOCK_ATTR_INIT(attr) *(attr) = 0
> +
> +#define bench_lock_t pthread_spinlock_t
> +#define bench_lock_attr_t int
> +
> +#define TEST_NAME "pthread-spin-lock"
> +
> +#include "bench-pthread-lock-base.c"
> diff --git a/benchtests/bench-pthread-spin-trylock.c b/benchtests/bench-pthread-spin-trylock.c
> new file mode 100644
> index 0000000000..49eb972761
> --- /dev/null
> +++ b/benchtests/bench-pthread-spin-trylock.c
> @@ -0,0 +1,34 @@
> +/* Measure spin_trylock for different threads and critical sections.
> + Copyright (C) 2022 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#define LOCK(lock) \
> + while (pthread_spin_trylock (lock) != 0) \
> + { \
> + non_critical_section (non_crt_len); \
> + }
> +#define UNLOCK(lock) pthread_spin_unlock (lock)
> +#define LOCK_INIT(lock, attr) pthread_spin_init (lock, *(attr))
> +#define LOCK_DESTROY(lock) pthread_spin_destroy (lock)
> +#define LOCK_ATTR_INIT(attr) *(attr) = 0
> +
> +#define bench_lock_t pthread_spinlock_t
> +#define bench_lock_attr_t int
> +
> +#define TEST_NAME "pthread-spin-trylock"
> +
> +#include "bench-pthread-lock-base.c"
> --
> 2.34.1
>
LGTM.
Thanks.
--
H.J.