[PATCH 3/4] S390: Use new __libc_tbegin_retry macro in elision-lock.c.

public inbox for libc-alpha@sourceware.org
 help / color / mirror / Atom feed

* [PATCH 3/4] S390: Use new __libc_tbegin_retry macro in elision-lock.c.
  2016-12-06 13:52 [PATCH 1/4] S390: Use C11-like atomics instead of plain memory accesses in lock elision code Stefan Liebler
@ 2016-12-06 13:52 ` Stefan Liebler
  2017-01-10 16:53   ` Torvald Riegel
  2016-12-06 13:52 ` [PATCH 2/4] S390: Use own tbegin macro instead of __builtin_tbegin Stefan Liebler
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 35+ messages in thread
From: Stefan Liebler @ 2016-12-06 13:52 UTC (permalink / raw)
  To: libc-alpha; +Cc: Stefan Liebler

This patch implements __libc_tbegin_retry macro which is equivalent to
gcc builtin __builtin_tbegin_retry, except the changes which were applied
to __libc_tbegin in the previous patch.

If tbegin aborts with _HTM_TBEGIN_TRANSIENT.  Then this macros restores
the fpc, fprs and automatically retries up to retry_cnt tbegins.
Further saving of the state is omitted as it is already saved in the
first round.  Before retrying a further transaction, the
transaction-abort-assist instruction is used to support the cpu.

This macro is now used in function __lll_lock_elision.

ChangeLog:

	* sysdeps/unix/sysv/linux/s390/htm.h(__libc_tbegin_retry): New macro.
	* sysdeps/unix/sysv/linux/s390/elision-lock.c (__lll_lock_elision):
	Use __libc_tbegin_retry macro.
---
 sysdeps/unix/sysv/linux/s390/elision-lock.c | 50 ++++++++++++++---------------
 sysdeps/unix/sysv/linux/s390/htm.h          | 36 +++++++++++++++++++--
 2 files changed, 58 insertions(+), 28 deletions(-)

diff --git a/sysdeps/unix/sysv/linux/s390/elision-lock.c b/sysdeps/unix/sysv/linux/s390/elision-lock.c
index 48cc3db..3dd7fbc 100644
--- a/sysdeps/unix/sysv/linux/s390/elision-lock.c
+++ b/sysdeps/unix/sysv/linux/s390/elision-lock.c
@@ -60,17 +60,16 @@ __lll_lock_elision (int *futex, short *adapt_count, EXTRAARG int private)
       goto use_lock;
     }
 
-  int try_tbegin;
-  for (try_tbegin = aconf.try_tbegin;
-       try_tbegin > 0;
-       try_tbegin--)
+  if (aconf.try_tbegin > 0)
     {
-      int status;
-      if (__builtin_expect
-	  ((status = __libc_tbegin ((void *) 0)) == _HTM_TBEGIN_STARTED, 1))
+      int status = __libc_tbegin_retry ((void *) 0, aconf.try_tbegin - 1);
+      if (__builtin_expect (status == _HTM_TBEGIN_STARTED,
+			    _HTM_TBEGIN_STARTED))
 	{
-	  if (*futex == 0)
+	  if (__builtin_expect (*futex == 0, 1))
+	    /* Lock was free.  Return to user code in a transaction.  */
 	    return 0;
+
 	  /* Lock was busy.  Fall back to normal locking.  */
 	  if (__builtin_expect (__libc_tx_nesting_depth (), 1))
 	    {
@@ -81,7 +80,6 @@ __lll_lock_elision (int *futex, short *adapt_count, EXTRAARG int private)
 		 See above for why relaxed MO is sufficient.  */
 	      if (aconf.skip_lock_busy > 0)
 		atomic_store_relaxed (adapt_count, aconf.skip_lock_busy);
-	      goto use_lock;
 	    }
 	  else /* nesting depth is > 1 */
 	    {
@@ -99,28 +97,28 @@ __lll_lock_elision (int *futex, short *adapt_count, EXTRAARG int private)
 	      __libc_tabort (_HTM_FIRST_USER_ABORT_CODE | 1);
 	    }
 	}
+      else if (status != _HTM_TBEGIN_TRANSIENT)
+	{
+	  /* A persistent abort (cc 1 or 3) indicates that a retry is
+	     probably futile.  Use the normal locking now and for the
+	     next couple of calls.
+	     Be careful to avoid writing to the lock.  See above for why
+	     relaxed MO is sufficient.  */
+	  if (aconf.skip_lock_internal_abort > 0)
+	    atomic_store_relaxed (adapt_count,
+				  aconf.skip_lock_internal_abort);
+	}
       else
 	{
-	  if (status != _HTM_TBEGIN_TRANSIENT)
-	    {
-	      /* A persistent abort (cc 1 or 3) indicates that a retry is
-		 probably futile.  Use the normal locking now and for the
-		 next couple of calls.
-		 Be careful to avoid writing to the lock.  See above for why
-		 relaxed MO is sufficient.  */
-	      if (aconf.skip_lock_internal_abort > 0)
-		atomic_store_relaxed (adapt_count,
-				      aconf.skip_lock_internal_abort);
-	      goto use_lock;
-	    }
+	  /* Same logic as above, but for for a number of temporary failures in
+	     a row.  */
+	  if (aconf.skip_lock_out_of_tbegin_retries > 0)
+	    atomic_store_relaxed (adapt_count,
+				  aconf.skip_lock_out_of_tbegin_retries);
 	}
     }
 
-  /* Same logic as above, but for for a number of temporary failures in a
-     row.  See above for why relaxed MO is sufficient.  */
-  if (aconf.skip_lock_out_of_tbegin_retries > 0 && aconf.try_tbegin > 0)
-    atomic_store_relaxed (adapt_count, aconf.skip_lock_out_of_tbegin_retries);
-
   use_lock:
+  /* Use normal locking as fallback path if transaction does not succeed.  */
   return LLL_LOCK ((*futex), private);
 }
diff --git a/sysdeps/unix/sysv/linux/s390/htm.h b/sysdeps/unix/sysv/linux/s390/htm.h
index 6b4e8f4..aa9d01a 100644
--- a/sysdeps/unix/sysv/linux/s390/htm.h
+++ b/sysdeps/unix/sysv/linux/s390/htm.h
@@ -69,7 +69,36 @@
    started.  Thus the user of the tbegin macros in this header file has to
    compile the file / function with -msoft-float.  It prevents gcc from using
    fprs / vrs.  */
-#define __libc_tbegin(tdb)						\
+#define __libc_tbegin(tdb) __libc_tbegin_base(tdb,,,)
+
+#define __libc_tbegin_retry_output_regs , [R_TX_CNT] "+&d" (__tx_cnt)
+#define __libc_tbegin_retry_input_regs(retry_cnt) , [R_RETRY] "d" (retry_cnt)
+#define __libc_tbegin_retry_abort_path_insn				\
+  /* If tbegin returned _HTM_TBEGIN_TRANSIENT, retry immediately so	\
+     that max tbegin_cnt transactions are tried.  Otherwise return and	\
+     let the caller of this macro do the fallback path.  */		\
+  "   jnh 1f\n\t" /* cc 1/3: jump to fallback path.  */			\
+  /* tbegin returned _HTM_TBEGIN_TRANSIENT: retry with transaction.  */ \
+  "   crje %[R_TX_CNT], %[R_RETRY], 1f\n\t" /* Reached max retries?  */	\
+  "   ahi %[R_TX_CNT], 1\n\t"						\
+  "   ppa %[R_TX_CNT], 0, 1\n\t" /* Transaction-Abort Assist.  */	\
+  "   j 2b\n\t" /* Loop to tbegin.  */
+
+/* Same as __libc_tbegin except if tbegin aborts with _HTM_TBEGIN_TRANSIENT.
+   Then this macros restores the fpc, fprs and automatically retries up to
+   retry_cnt tbegins.  Further saving of the state is omitted as it is already
+   saved.  This macro calls tbegin at most as retry_cnt + 1 times.  */
+#define __libc_tbegin_retry(tdb, retry_cnt)				\
+  ({ int __ret;								\
+    int __tx_cnt = 0;							\
+    __ret = __libc_tbegin_base(tdb,					\
+			       __libc_tbegin_retry_abort_path_insn,	\
+			       __libc_tbegin_retry_output_regs,		\
+			       __libc_tbegin_retry_input_regs(retry_cnt)); \
+    __ret;								\
+  })
+
+#define __libc_tbegin_base(tdb, abort_path_insn, output_regs, input_regs) \
   ({ int __ret;								\
      int __fpc;								\
      char __fprs[TX_FPRS_BYTES];					\
@@ -95,7 +124,7 @@
 			      again and result in a core dump wich does	\
 			      now show at tbegin but the real executed	\
 			      instruction.  */				\
-			   "   tbegin 0, 0xFF0E\n\t"			\
+			   "2: tbegin 0, 0xFF0E\n\t"			\
 			   /* Branch away in abort case (this is the	\
 			      prefered sequence.  See PoP in chapter 5	\
 			      Transactional-Execution Facility		\
@@ -111,11 +140,14 @@
 			   "   srl %[R_RET], 28\n\t"			\
 			   "   sfpc %[R_FPC]\n\t"			\
 			   TX_RESTORE_FPRS				\
+			   abort_path_insn				\
 			   "1:\n\t"					\
 			   ".machine pop\n"				\
 			   : [R_RET] "=&d" (__ret),			\
 			     [R_FPC] "=&d" (__fpc)			\
+			     output_regs				\
 			   : [R_FPRS] "a" (__fprs)			\
+			     input_regs					\
 			   : "cc", "memory");				\
      __ret;								\
      })
-- 
2.5.5

^ permalink raw reply	[flat|nested] 35+ messages in thread

* [PATCH 1/4] S390: Use C11-like atomics instead of plain memory accesses in lock elision code.
@ 2016-12-06 13:52 Stefan Liebler
  2016-12-06 13:52 ` [PATCH 3/4] S390: Use new __libc_tbegin_retry macro in elision-lock.c Stefan Liebler
                   ` (3 more replies)
  0 siblings, 4 replies; 35+ messages in thread
From: Stefan Liebler @ 2016-12-06 13:52 UTC (permalink / raw)
  To: libc-alpha; +Cc: Stefan Liebler

This uses atomic operations to access lock elision metadata that is accessed
concurrently (ie, adapt_count fields).  The size of the data is less than a
word but accessed only with atomic loads and stores.

See also x86 commit ca6e601a9d4a72b3699cca15bad12ac1716bf49a:
"Use C11-like atomics instead of plain memory accesses in x86 lock elision."

ChangeLog:

	* sysdeps/unix/sysv/linux/s390/elision-lock.c
	(__lll_lock_elision): Use atomics to load / store adapt_count.
	* sysdeps/unix/sysv/linux/s390/elision-trylock.c
	(__lll_trylock_elision): Likewise.
---
 sysdeps/unix/sysv/linux/s390/elision-lock.c    | 25 ++++++++++++++++++-------
 sysdeps/unix/sysv/linux/s390/elision-trylock.c | 14 +++++++++-----
 2 files changed, 27 insertions(+), 12 deletions(-)

diff --git a/sysdeps/unix/sysv/linux/s390/elision-lock.c b/sysdeps/unix/sysv/linux/s390/elision-lock.c
index ecb507e..1876d21 100644
--- a/sysdeps/unix/sysv/linux/s390/elision-lock.c
+++ b/sysdeps/unix/sysv/linux/s390/elision-lock.c
@@ -45,11 +45,18 @@
 int
 __lll_lock_elision (int *futex, short *adapt_count, EXTRAARG int private)
 {
-  if (*adapt_count > 0)
+  /* adapt_count can be accessed concurrently; these accesses can be both
+     inside of transactions (if critical sections are nested and the outer
+     critical section uses lock elision) and outside of transactions.  Thus,
+     we need to use atomic accesses to avoid data races.  However, the
+     value of adapt_count is just a hint, so relaxed MO accesses are
+     sufficient.  */
+  if (atomic_load_relaxed (adapt_count) > 0)
     {
       /* Lost updates are possible, but harmless.  Due to races this might lead
 	 to *adapt_count becoming less than zero.  */
-      (*adapt_count)--;
+      atomic_store_relaxed (adapt_count,
+			    atomic_load_relaxed (adapt_count) - 1);
       goto use_lock;
     }
 
@@ -74,8 +81,10 @@ __lll_lock_elision (int *futex, short *adapt_count, EXTRAARG int private)
 	      /* In a non-nested transaction there is no need to abort,
 		 which is expensive.  */
 	      __builtin_tend ();
+	      /* Don't try to use transactions for the next couple of times.
+		 See above for why relaxed MO is sufficient.  */
 	      if (aconf.skip_lock_busy > 0)
-		*adapt_count = aconf.skip_lock_busy;
+		atomic_store_relaxed (adapt_count, aconf.skip_lock_busy);
 	      goto use_lock;
 	    }
 	  else /* nesting depth is > 1 */
@@ -101,18 +110,20 @@ __lll_lock_elision (int *futex, short *adapt_count, EXTRAARG int private)
 	      /* A persistent abort (cc 1 or 3) indicates that a retry is
 		 probably futile.  Use the normal locking now and for the
 		 next couple of calls.
-		 Be careful to avoid writing to the lock.  */
+		 Be careful to avoid writing to the lock.  See above for why
+		 relaxed MO is sufficient.  */
 	      if (aconf.skip_lock_internal_abort > 0)
-		*adapt_count = aconf.skip_lock_internal_abort;
+		atomic_store_relaxed (adapt_count,
+				      aconf.skip_lock_internal_abort);
 	      goto use_lock;
 	    }
 	}
     }
 
   /* Same logic as above, but for for a number of temporary failures in a
-     row.  */
+     row.  See above for why relaxed MO is sufficient.  */
   if (aconf.skip_lock_out_of_tbegin_retries > 0 && aconf.try_tbegin > 0)
-    *adapt_count = aconf.skip_lock_out_of_tbegin_retries;
+    atomic_store_relaxed (adapt_count, aconf.skip_lock_out_of_tbegin_retries);
 
   use_lock:
   return LLL_LOCK ((*futex), private);
diff --git a/sysdeps/unix/sysv/linux/s390/elision-trylock.c b/sysdeps/unix/sysv/linux/s390/elision-trylock.c
index 3d5a994..a3252b8 100644
--- a/sysdeps/unix/sysv/linux/s390/elision-trylock.c
+++ b/sysdeps/unix/sysv/linux/s390/elision-trylock.c
@@ -49,8 +49,10 @@ __lll_trylock_elision (int *futex, short *adapt_count)
       __builtin_tabort (_HTM_FIRST_USER_ABORT_CODE | 1);
     }
 
-  /* Only try a transaction if it's worth it.  */
-  if (*adapt_count <= 0)
+  /* Only try a transaction if it's worth it.  See __lll_lock_elision for
+     why we need atomic accesses.  Relaxed MO is sufficient because this is
+     just a hint.  */
+  if (atomic_load_relaxed (adapt_count) <= 0)
     {
       unsigned status;
 
@@ -65,9 +67,10 @@ __lll_trylock_elision (int *futex, short *adapt_count)
 	  __builtin_tend ();
 	  /* Note: Changing the adapt_count here might abort a transaction on a
 	     different cpu, but that could happen anyway when the futex is
-	     acquired, so there's no need to check the nesting depth here.  */
+	     acquired, so there's no need to check the nesting depth here.
+	     See above for why relaxed MO is sufficient.  */
 	  if (aconf.skip_lock_busy > 0)
-	    *adapt_count = aconf.skip_lock_busy;
+	    atomic_store_relaxed (adapt_count, aconf.skip_lock_busy);
 	}
       else
 	{
@@ -87,7 +90,8 @@ __lll_trylock_elision (int *futex, short *adapt_count)
     {
       /* Lost updates are possible, but harmless.  Due to races this might lead
 	 to *adapt_count becoming less than zero.  */
-      (*adapt_count)--;
+      atomic_store_relaxed (adapt_count,
+			    atomic_load_relaxed (adapt_count) - 1);
     }
 
   return lll_trylock (*futex);
-- 
2.5.5

^ permalink raw reply	[flat|nested] 35+ messages in thread

* [PATCH 2/4] S390: Use own tbegin macro instead of __builtin_tbegin.
  2016-12-06 13:52 [PATCH 1/4] S390: Use C11-like atomics instead of plain memory accesses in lock elision code Stefan Liebler
  2016-12-06 13:52 ` [PATCH 3/4] S390: Use new __libc_tbegin_retry macro in elision-lock.c Stefan Liebler
@ 2016-12-06 13:52 ` Stefan Liebler
  2016-12-22 12:54   ` Florian Weimer
  2017-01-10 16:34   ` Torvald Riegel
  2016-12-06 13:52 ` [PATCH 4/4] S390: Optimize lock-elision by decrementing adapt_count at unlock Stefan Liebler
  2016-12-13  8:38 ` [PING][PATCH 1/4] S390: Use C11-like atomics instead of plain memory accesses in lock elision code Stefan Liebler
  3 siblings, 2 replies; 35+ messages in thread
From: Stefan Liebler @ 2016-12-06 13:52 UTC (permalink / raw)
  To: libc-alpha; +Cc: Stefan Liebler

This patch defines __libc_tbegin, __libc_tend, __libc_tabort and
__libc_tx_nesting_depth in htm.h which replaces the direct usage of
equivalent gcc builtins.

We have to use an own inline assembly instead of __builtin_tbegin,
as tbegin has to filter program interruptions which can't be done with
the builtin.  Before this change, e.g. a segmentation fault within a
transaction, leads to a coredump where the instruction pointer points
behind the tbegin instruction instead of real failing one.
Now the transaction aborts and the code should be reexecuted by the
fallback path without transactions.  The segmentation fault will
produce a coredump with the real failing instruction.

The fpc is not saved before starting the transaction.  If e.g. the
rounging mode is changed and the transaction is aborting afterwards,
the builtin will not restore the fpc.  This is now done with the
__libc_tbegin macro.

Now the call saved fprs have to be saved / restored in the
__libc_tbegin macro.  Using the gcc builtin had forced the saving /
restoring of fprs at begin / end of e.g. __lll_lock_elision function.
The new macro saves these fprs before tbegin instruction and only
restores them on a transaction abort.  Restoring is not needed on
a successfully started transaction.

The used inline assembly does not clobber the fprs / vrs!
Clobbering the latter ones would force the compiler to save / restore
the call saved fprs as those overlap with the vrs, but they only
need to be restored if the transaction fails.  Thus the user of the
tbegin macros has to compile the file / function with -msoft-float.
It prevents gcc from using fprs / vrs.

ChangeLog:

	* sysdeps/unix/sysv/linux/s390/Makefile (elision-CFLAGS):
	Add -msoft-float.
	* sysdeps/unix/sysv/linux/s390/htm.h: New File.
	* sysdeps/unix/sysv/linux/s390/elision-lock.c:
	Use __libc_t* transaction macros instead of __builtin_t*.
	* sysdeps/unix/sysv/linux/s390/elision-trylock.c: Likewise.
	* sysdeps/unix/sysv/linux/s390/elision-unlock.c: Likewise.
---
 sysdeps/unix/sysv/linux/s390/Makefile          |   2 +-
 sysdeps/unix/sysv/linux/s390/elision-lock.c    |  16 +--
 sysdeps/unix/sysv/linux/s390/elision-trylock.c |  16 +--
 sysdeps/unix/sysv/linux/s390/elision-unlock.c  |   6 +-
 sysdeps/unix/sysv/linux/s390/htm.h             | 149 +++++++++++++++++++++++++
 5 files changed, 164 insertions(+), 25 deletions(-)
 create mode 100644 sysdeps/unix/sysv/linux/s390/htm.h

diff --git a/sysdeps/unix/sysv/linux/s390/Makefile b/sysdeps/unix/sysv/linux/s390/Makefile
index f8ed013..3867c33 100644
--- a/sysdeps/unix/sysv/linux/s390/Makefile
+++ b/sysdeps/unix/sysv/linux/s390/Makefile
@@ -22,7 +22,7 @@ ifeq ($(enable-lock-elision),yes)
 libpthread-sysdep_routines += elision-lock elision-unlock elision-timed \
 			      elision-trylock
 
-elision-CFLAGS = -mhtm
+elision-CFLAGS = -mhtm -msoft-float
 CFLAGS-elision-lock.c = $(elision-CFLAGS)
 CFLAGS-elision-timed.c = $(elision-CFLAGS)
 CFLAGS-elision-trylock.c = $(elision-CFLAGS)
diff --git a/sysdeps/unix/sysv/linux/s390/elision-lock.c b/sysdeps/unix/sysv/linux/s390/elision-lock.c
index 1876d21..48cc3db 100644
--- a/sysdeps/unix/sysv/linux/s390/elision-lock.c
+++ b/sysdeps/unix/sysv/linux/s390/elision-lock.c
@@ -19,7 +19,7 @@
 #include <pthread.h>
 #include <pthreadP.h>
 #include <lowlevellock.h>
-#include <htmintrin.h>
+#include <htm.h>
 #include <elision-conf.h>
 #include <stdint.h>
 
@@ -60,27 +60,23 @@ __lll_lock_elision (int *futex, short *adapt_count, EXTRAARG int private)
       goto use_lock;
     }
 
-  __asm__ volatile (".machinemode \"zarch_nohighgprs\"\n\t"
-		    ".machine \"all\""
-		    : : : "memory");
-
   int try_tbegin;
   for (try_tbegin = aconf.try_tbegin;
        try_tbegin > 0;
        try_tbegin--)
     {
-      unsigned status;
+      int status;
       if (__builtin_expect
-	  ((status = __builtin_tbegin((void *)0)) == _HTM_TBEGIN_STARTED, 1))
+	  ((status = __libc_tbegin ((void *) 0)) == _HTM_TBEGIN_STARTED, 1))
 	{
 	  if (*futex == 0)
 	    return 0;
 	  /* Lock was busy.  Fall back to normal locking.  */
-	  if (__builtin_expect (__builtin_tx_nesting_depth (), 1))
+	  if (__builtin_expect (__libc_tx_nesting_depth (), 1))
 	    {
 	      /* In a non-nested transaction there is no need to abort,
 		 which is expensive.  */
-	      __builtin_tend ();
+	      __libc_tend ();
 	      /* Don't try to use transactions for the next couple of times.
 		 See above for why relaxed MO is sufficient.  */
 	      if (aconf.skip_lock_busy > 0)
@@ -100,7 +96,7 @@ __lll_lock_elision (int *futex, short *adapt_count, EXTRAARG int private)
 		 because using the default lock with the inner mutex
 		 would abort the outer transaction.
 	      */
-	      __builtin_tabort (_HTM_FIRST_USER_ABORT_CODE | 1);
+	      __libc_tabort (_HTM_FIRST_USER_ABORT_CODE | 1);
 	    }
 	}
       else
diff --git a/sysdeps/unix/sysv/linux/s390/elision-trylock.c b/sysdeps/unix/sysv/linux/s390/elision-trylock.c
index a3252b8..e21fc26 100644
--- a/sysdeps/unix/sysv/linux/s390/elision-trylock.c
+++ b/sysdeps/unix/sysv/linux/s390/elision-trylock.c
@@ -19,7 +19,7 @@
 #include <pthread.h>
 #include <pthreadP.h>
 #include <lowlevellock.h>
-#include <htmintrin.h>
+#include <htm.h>
 #include <elision-conf.h>
 
 #define aconf __elision_aconf
@@ -30,15 +30,11 @@
 int
 __lll_trylock_elision (int *futex, short *adapt_count)
 {
-  __asm__ __volatile__ (".machinemode \"zarch_nohighgprs\"\n\t"
-			".machine \"all\""
-			: : : "memory");
-
   /* Implement POSIX semantics by forbiding nesting elided trylocks.
      Sorry.  After the abort the code is re-executed
      non transactional and if the lock was already locked
      return an error.  */
-  if (__builtin_tx_nesting_depth () > 0)
+  if (__libc_tx_nesting_depth () > 0)
     {
       /* Note that this abort may terminate an outermost transaction that
 	 was created outside glibc.
@@ -46,7 +42,7 @@ __lll_trylock_elision (int *futex, short *adapt_count)
 	 them to use the default lock instead of retrying transactions
 	 until their try_tbegin is zero.
       */
-      __builtin_tabort (_HTM_FIRST_USER_ABORT_CODE | 1);
+      __libc_tabort (_HTM_FIRST_USER_ABORT_CODE | 1);
     }
 
   /* Only try a transaction if it's worth it.  See __lll_lock_elision for
@@ -54,17 +50,17 @@ __lll_trylock_elision (int *futex, short *adapt_count)
      just a hint.  */
   if (atomic_load_relaxed (adapt_count) <= 0)
     {
-      unsigned status;
+      int status;
 
       if (__builtin_expect
-	  ((status = __builtin_tbegin ((void *)0)) == _HTM_TBEGIN_STARTED, 1))
+	  ((status = __libc_tbegin ((void *) 0)) == _HTM_TBEGIN_STARTED, 1))
 	{
 	  if (*futex == 0)
 	    return 0;
 	  /* Lock was busy.  Fall back to normal locking.  */
 	  /* Since we are in a non-nested transaction there is no need to abort,
 	     which is expensive.  */
-	  __builtin_tend ();
+	  __libc_tend ();
 	  /* Note: Changing the adapt_count here might abort a transaction on a
 	     different cpu, but that could happen anyway when the futex is
 	     acquired, so there's no need to check the nesting depth here.
diff --git a/sysdeps/unix/sysv/linux/s390/elision-unlock.c b/sysdeps/unix/sysv/linux/s390/elision-unlock.c
index 483abe1..0b1ade9 100644
--- a/sysdeps/unix/sysv/linux/s390/elision-unlock.c
+++ b/sysdeps/unix/sysv/linux/s390/elision-unlock.c
@@ -18,6 +18,7 @@
 
 #include <pthreadP.h>
 #include <lowlevellock.h>
+#include <htm.h>
 
 int
 __lll_unlock_elision(int *futex, int private)
@@ -27,10 +28,7 @@ __lll_unlock_elision(int *futex, int private)
      have closed the transaction, but that is impossible to detect reliably.  */
   if (*futex == 0)
     {
-      __asm__ volatile (".machinemode \"zarch_nohighgprs\"\n\t"
-			".machine \"all\""
-			: : : "memory");
-      __builtin_tend();
+      __libc_tend ();
     }
   else
     lll_unlock ((*futex), private);
diff --git a/sysdeps/unix/sysv/linux/s390/htm.h b/sysdeps/unix/sysv/linux/s390/htm.h
new file mode 100644
index 0000000..6b4e8f4
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/s390/htm.h
@@ -0,0 +1,149 @@
+/* Shared HTM header.  Work around false transactional execution facility
+   intrinsics.
+
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _HTM_H
+#define _HTM_H 1
+
+#include <htmintrin.h>
+
+#ifdef __s390x__
+# define TX_FPRS_BYTES 64
+# define TX_SAVE_FPRS						\
+  "   std %%f8, 0(%[R_FPRS])\n\t"				\
+  "   std %%f9, 8(%[R_FPRS])\n\t"				\
+  "   std %%f10, 16(%[R_FPRS])\n\t"				\
+  "   std %%f11, 24(%[R_FPRS])\n\t"				\
+  "   std %%f12, 32(%[R_FPRS])\n\t"				\
+  "   std %%f13, 40(%[R_FPRS])\n\t"				\
+  "   std %%f14, 48(%[R_FPRS])\n\t"				\
+  "   std %%f15, 56(%[R_FPRS])\n\t"
+
+# define TX_RESTORE_FPRS					\
+  "   ld %%f8, 0(%[R_FPRS])\n\t"				\
+  "   ld %%f9, 8(%[R_FPRS])\n\t"				\
+  "   ld %%f10, 16(%[R_FPRS])\n\t"				\
+  "   ld %%f11, 24(%[R_FPRS])\n\t"				\
+  "   ld %%f12, 32(%[R_FPRS])\n\t"				\
+  "   ld %%f13, 40(%[R_FPRS])\n\t"				\
+  "   ld %%f14, 48(%[R_FPRS])\n\t"				\
+  "   ld %%f15, 56(%[R_FPRS])\n\t"
+
+#else
+
+# define TX_FPRS_BYTES 16
+# define TX_SAVE_FPRS						\
+  "   std %%f4, 0(%[R_FPRS])\n\t"				\
+  "   std %%f6, 8(%[R_FPRS])\n\t"
+
+# define TX_RESTORE_FPRS					\
+  "   ld %%f4, 0(%[R_FPRS])\n\t"				\
+  "   ld %%f6, 8(%[R_FPRS])\n\t"
+
+#endif /* ! __s390x__  */
+
+/* Use own inline assembly instead of __builtin_tbegin, as tbegin
+   has to filter program interruptions which can't be done with the builtin.
+   Now the fprs have to be saved / restored here, too.
+   The fpc is also not saved / restored with the builtin.
+   The used inline assembly does not clobber the volatile fprs / vrs!
+   Clobbering the latter ones would force the compiler to save / restore
+   the call saved fprs as those overlap with the vrs, but they only need to be
+   restored if the transaction fails but not if the transaction is successfully
+   started.  Thus the user of the tbegin macros in this header file has to
+   compile the file / function with -msoft-float.  It prevents gcc from using
+   fprs / vrs.  */
+#define __libc_tbegin(tdb)						\
+  ({ int __ret;								\
+     int __fpc;								\
+     char __fprs[TX_FPRS_BYTES];					\
+     __asm__ __volatile__ (".machine push\n\t"				\
+			   ".machinemode \"zarch_nohighgprs\"\n\t"	\
+			   ".machine \"all\"\n\t"			\
+			   /* Save state at the outermost transaction.	\
+			      As extracting nesting depth is expensive	\
+			      on at least zEC12, save fprs at inner	\
+			      transactions, too.			\
+			      The fpc and fprs are saved here as they	\
+			      are not saved by tbegin.  There exist no	\
+			      call-saved vrs, thus they are not saved	\
+			      here.  */					\
+			   "   efpc %[R_FPC]\n\t"			\
+			   TX_SAVE_FPRS					\
+			   /* Begin transaction: save all gprs, allow	\
+			      ar modification and fp operations.  Some	\
+			      program-interruptions (e.g. a null	\
+			      pointer access) are filtered and the	\
+			      trancsaction will abort.  In this case	\
+			      the normal lock path will execute it	\
+			      again and result in a core dump wich does	\
+			      now show at tbegin but the real executed	\
+			      instruction.  */				\
+			   "   tbegin 0, 0xFF0E\n\t"			\
+			   /* Branch away in abort case (this is the	\
+			      prefered sequence.  See PoP in chapter 5	\
+			      Transactional-Execution Facility		\
+			      Operation).  */				\
+			   "   jnz 0f\n\t"				\
+			   /* Transaction has successfully started.  */	\
+			   "   lhi %[R_RET], 0\n\t"			\
+			   "   j 1f\n\t"				\
+			   /* Transaction has aborted.  Now we are at	\
+			      the outermost transaction.  Restore fprs	\
+			      and fpc. */				\
+			   "0: ipm %[R_RET]\n\t"			\
+			   "   srl %[R_RET], 28\n\t"			\
+			   "   sfpc %[R_FPC]\n\t"			\
+			   TX_RESTORE_FPRS				\
+			   "1:\n\t"					\
+			   ".machine pop\n"				\
+			   : [R_RET] "=&d" (__ret),			\
+			     [R_FPC] "=&d" (__fpc)			\
+			   : [R_FPRS] "a" (__fprs)			\
+			   : "cc", "memory");				\
+     __ret;								\
+     })
+
+/* These builtins are correct.  Use them.  */
+#define __libc_tend()							\
+  ({ __asm__ __volatile__ (".machine push\n\t"				\
+			   ".machinemode \"zarch_nohighgprs\"\n\t"	\
+			   ".machine \"all\"\n\t");			\
+    int __ret = __builtin_tend ();					\
+    __asm__ __volatile__ (".machine pop");				\
+    __ret;								\
+  })
+
+#define __libc_tabort(abortcode)					\
+  __asm__ __volatile__ (".machine push\n\t"				\
+			".machinemode \"zarch_nohighgprs\"\n\t"		\
+			".machine \"all\"\n\t");			\
+  __builtin_tabort (abortcode);						\
+  __asm__ __volatile__ (".machine pop")
+
+#define __libc_tx_nesting_depth() \
+  ({ __asm__ __volatile__ (".machine push\n\t"				\
+			   ".machinemode \"zarch_nohighgprs\"\n\t"	\
+			   ".machine \"all\"\n\t");			\
+    int __ret = __builtin_tx_nesting_depth ();				\
+    __asm__ __volatile__ (".machine pop");				\
+    __ret;								\
+  })
+
+#endif
-- 
2.5.5

^ permalink raw reply	[flat|nested] 35+ messages in thread

* [PATCH 4/4] S390: Optimize lock-elision by decrementing adapt_count at unlock.
  2016-12-06 13:52 [PATCH 1/4] S390: Use C11-like atomics instead of plain memory accesses in lock elision code Stefan Liebler
  2016-12-06 13:52 ` [PATCH 3/4] S390: Use new __libc_tbegin_retry macro in elision-lock.c Stefan Liebler
  2016-12-06 13:52 ` [PATCH 2/4] S390: Use own tbegin macro instead of __builtin_tbegin Stefan Liebler
@ 2016-12-06 13:52 ` Stefan Liebler
  2017-01-11 10:53   ` Torvald Riegel
  2016-12-13  8:38 ` [PING][PATCH 1/4] S390: Use C11-like atomics instead of plain memory accesses in lock elision code Stefan Liebler
  3 siblings, 1 reply; 35+ messages in thread
From: Stefan Liebler @ 2016-12-06 13:52 UTC (permalink / raw)
  To: libc-alpha; +Cc: Stefan Liebler

This patch decrements the adapt_count while unlocking the futex
instead of before aquiring the futex as it is done on power, too.
Furthermore a transaction is only started if the futex is currently free.
This check is done after starting the transaction, too.
If the futex is not free and the transaction nesting depth is one,
we can simply end the started transaction instead of aborting it.
The implementation of this check was faulty as it always ended the
started transaction.  By using the fallback path, the the outermost
transaction was aborted.  Now the outermost transaction is aborted
directly.

This patch also adds some commentary and aligns the code in
elision-trylock.c to the code in elision-lock.c as possible.

ChangeLog:

	* sysdeps/unix/sysv/linux/s390/lowlevellock.h
	(__lll_unlock_elision, lll_unlock_elision): Add adapt_count argument.
	* sysdeps/unix/sysv/linux/s390/elision-lock.c:
	(__lll_lock_elision): Decrement adapt_count while unlocking
	instead of before locking.
	* sysdeps/unix/sysv/linux/s390/elision-trylock.c
	(__lll_trylock_elision): Likewise.
	* sysdeps/unix/sysv/linux/s390/elision-unlock.c:
	(__lll_unlock_elision): Likewise.
---
 sysdeps/unix/sysv/linux/s390/elision-lock.c    | 37 ++++++++-------
 sysdeps/unix/sysv/linux/s390/elision-trylock.c | 62 ++++++++++++++------------
 sysdeps/unix/sysv/linux/s390/elision-unlock.c  | 29 ++++++++++--
 sysdeps/unix/sysv/linux/s390/lowlevellock.h    |  4 +-
 4 files changed, 78 insertions(+), 54 deletions(-)

diff --git a/sysdeps/unix/sysv/linux/s390/elision-lock.c b/sysdeps/unix/sysv/linux/s390/elision-lock.c
index 3dd7fbc..4a7d546 100644
--- a/sysdeps/unix/sysv/linux/s390/elision-lock.c
+++ b/sysdeps/unix/sysv/linux/s390/elision-lock.c
@@ -50,31 +50,30 @@ __lll_lock_elision (int *futex, short *adapt_count, EXTRAARG int private)
      critical section uses lock elision) and outside of transactions.  Thus,
      we need to use atomic accesses to avoid data races.  However, the
      value of adapt_count is just a hint, so relaxed MO accesses are
-     sufficient.  */
-  if (atomic_load_relaxed (adapt_count) > 0)
-    {
-      /* Lost updates are possible, but harmless.  Due to races this might lead
-	 to *adapt_count becoming less than zero.  */
-      atomic_store_relaxed (adapt_count,
-			    atomic_load_relaxed (adapt_count) - 1);
-      goto use_lock;
-    }
-
-  if (aconf.try_tbegin > 0)
+     sufficient.
+     Do not begin a transaction if another cpu has locked the
+     futex with normal locking.  If adapt_count is zero, it remains and the
+     next pthread_mutex_lock call will try to start a transaction again.  */
+    if (atomic_load_relaxed (futex) == 0
+	&& atomic_load_relaxed (adapt_count) <= 0 && aconf.try_tbegin > 0)
     {
       int status = __libc_tbegin_retry ((void *) 0, aconf.try_tbegin - 1);
       if (__builtin_expect (status == _HTM_TBEGIN_STARTED,
 			    _HTM_TBEGIN_STARTED))
 	{
-	  if (__builtin_expect (*futex == 0, 1))
+	  /* Check the futex to make sure nobody has touched it in the
+	     mean time.  This forces the futex into the cache and makes
+	     sure the transaction aborts if some other cpu uses the
+	     lock (writes the futex).  */
+	  if (__builtin_expect (atomic_load_relaxed (futex) == 0, 1))
 	    /* Lock was free.  Return to user code in a transaction.  */
 	    return 0;
 
 	  /* Lock was busy.  Fall back to normal locking.  */
-	  if (__builtin_expect (__libc_tx_nesting_depth (), 1))
+	  if (__builtin_expect (__libc_tx_nesting_depth () <= 1, 1))
 	    {
 	      /* In a non-nested transaction there is no need to abort,
-		 which is expensive.  */
+		 which is expensive.  Simply end the started transaction.  */
 	      __libc_tend ();
 	      /* Don't try to use transactions for the next couple of times.
 		 See above for why relaxed MO is sufficient.  */
@@ -92,9 +91,9 @@ __lll_lock_elision (int *futex, short *adapt_count, EXTRAARG int private)
 		 is zero.
 		 The adapt_count of this inner mutex is not changed,
 		 because using the default lock with the inner mutex
-		 would abort the outer transaction.
-	      */
+		 would abort the outer transaction.  */
 	      __libc_tabort (_HTM_FIRST_USER_ABORT_CODE | 1);
+	      __builtin_unreachable ();
 	    }
 	}
       else if (status != _HTM_TBEGIN_TRANSIENT)
@@ -110,15 +109,15 @@ __lll_lock_elision (int *futex, short *adapt_count, EXTRAARG int private)
 	}
       else
 	{
-	  /* Same logic as above, but for for a number of temporary failures in
-	     a row.  */
+	  /* The transaction failed for some retries with
+	     _HTM_TBEGIN_TRANSIENT.  Use the normal locking now and for the
+	     next couple of calls.  */
 	  if (aconf.skip_lock_out_of_tbegin_retries > 0)
 	    atomic_store_relaxed (adapt_count,
 				  aconf.skip_lock_out_of_tbegin_retries);
 	}
     }
 
-  use_lock:
   /* Use normal locking as fallback path if transaction does not succeed.  */
   return LLL_LOCK ((*futex), private);
 }
diff --git a/sysdeps/unix/sysv/linux/s390/elision-trylock.c b/sysdeps/unix/sysv/linux/s390/elision-trylock.c
index e21fc26..dee66d4 100644
--- a/sysdeps/unix/sysv/linux/s390/elision-trylock.c
+++ b/sysdeps/unix/sysv/linux/s390/elision-trylock.c
@@ -43,23 +43,36 @@ __lll_trylock_elision (int *futex, short *adapt_count)
 	 until their try_tbegin is zero.
       */
       __libc_tabort (_HTM_FIRST_USER_ABORT_CODE | 1);
+      __builtin_unreachable ();
     }
 
-  /* Only try a transaction if it's worth it.  See __lll_lock_elision for
-     why we need atomic accesses.  Relaxed MO is sufficient because this is
-     just a hint.  */
-  if (atomic_load_relaxed (adapt_count) <= 0)
+  /* adapt_count can be accessed concurrently; these accesses can be both
+     inside of transactions (if critical sections are nested and the outer
+     critical section uses lock elision) and outside of transactions.  Thus,
+     we need to use atomic accesses to avoid data races.  However, the
+     value of adapt_count is just a hint, so relaxed MO accesses are
+     sufficient.
+     Do not begin a transaction if another cpu has locked the
+     futex with normal locking.  If adapt_count is zero, it remains and the
+     next pthread_mutex_lock call will try to start a transaction again.  */
+    if (atomic_load_relaxed (futex) == 0
+	&& atomic_load_relaxed (adapt_count) <= 0 && aconf.try_tbegin > 0)
     {
-      int status;
-
-      if (__builtin_expect
-	  ((status = __libc_tbegin ((void *) 0)) == _HTM_TBEGIN_STARTED, 1))
+      int status = __libc_tbegin ((void *) 0);
+      if (__builtin_expect (status  == _HTM_TBEGIN_STARTED,
+			    _HTM_TBEGIN_STARTED))
 	{
-	  if (*futex == 0)
+	  /* Check the futex to make sure nobody has touched it in the
+	     mean time.  This forces the futex into the cache and makes
+	     sure the transaction aborts if some other cpu uses the
+	     lock (writes the futex).  */
+	  if (__builtin_expect (atomic_load_relaxed (futex) == 0, 1))
+	    /* Lock was free.  Return to user code in a transaction.  */
 	    return 0;
-	  /* Lock was busy.  Fall back to normal locking.  */
-	  /* Since we are in a non-nested transaction there is no need to abort,
-	     which is expensive.  */
+
+	  /* Lock was busy.  Fall back to normal locking.  Since we are in
+	     a non-nested transaction there is no need to abort, which is
+	     expensive.  Simply end the started transaction.  */
 	  __libc_tend ();
 	  /* Note: Changing the adapt_count here might abort a transaction on a
 	     different cpu, but that could happen anyway when the futex is
@@ -68,27 +81,18 @@ __lll_trylock_elision (int *futex, short *adapt_count)
 	  if (aconf.skip_lock_busy > 0)
 	    atomic_store_relaxed (adapt_count, aconf.skip_lock_busy);
 	}
-      else
+      else if (status != _HTM_TBEGIN_TRANSIENT)
 	{
-	  if (status != _HTM_TBEGIN_TRANSIENT)
-	    {
-	      /* A persistent abort (cc 1 or 3) indicates that a retry is
-		 probably futile.  Use the normal locking now and for the
-		 next couple of calls.
-		 Be careful to avoid writing to the lock.  */
-	      if (aconf.skip_trylock_internal_abort > 0)
-		*adapt_count = aconf.skip_trylock_internal_abort;
-	    }
+	  /* A persistent abort (cc 1 or 3) indicates that a retry is
+	     probably futile.  Use the normal locking now and for the
+	     next couple of calls.
+	     Be careful to avoid writing to the lock.  */
+	  if (aconf.skip_trylock_internal_abort > 0)
+	    *adapt_count = aconf.skip_trylock_internal_abort;
 	}
       /* Could do some retries here.  */
     }
-  else
-    {
-      /* Lost updates are possible, but harmless.  Due to races this might lead
-	 to *adapt_count becoming less than zero.  */
-      atomic_store_relaxed (adapt_count,
-			    atomic_load_relaxed (adapt_count) - 1);
-    }
 
+  /* Use normal locking as fallback path if transaction does not succeed.  */
   return lll_trylock (*futex);
 }
diff --git a/sysdeps/unix/sysv/linux/s390/elision-unlock.c b/sysdeps/unix/sysv/linux/s390/elision-unlock.c
index 0b1ade9..e68d970 100644
--- a/sysdeps/unix/sysv/linux/s390/elision-unlock.c
+++ b/sysdeps/unix/sysv/linux/s390/elision-unlock.c
@@ -21,16 +21,37 @@
 #include <htm.h>
 
 int
-__lll_unlock_elision(int *futex, int private)
+__lll_unlock_elision(int *futex, short *adapt_count, int private)
 {
   /* If the lock is free, we elided the lock earlier.  This does not
      necessarily mean that we are in a transaction, because the user code may
-     have closed the transaction, but that is impossible to detect reliably.  */
-  if (*futex == 0)
+     have closed the transaction, but that is impossible to detect reliably.
+     Relaxed MO access to futex is sufficient as we only need a hint, if we
+     started a transaction or acquired the futex in e.g. elision-lock.c.  */
+  if (atomic_load_relaxed (futex) == 0)
     {
       __libc_tend ();
     }
   else
-    lll_unlock ((*futex), private);
+    {
+      /* Update the adapt_count while unlocking before completing the critical
+	 section.  adapt_count is accessed concurrently outside of a
+	 transaction or an aquired lock e.g. in elision-lock.c so we need to use
+	 atomic accesses.  However, the value of adapt_count is just a hint, so
+	 relaxed MO accesses are sufficient.
+	 If adapt_count would be decremented while locking, multiple
+	 CPUs trying to lock the locked mutex will decrement adapt_count to
+	 zero and another CPU will try to start a transaction, which will be
+	 immediately aborted as the mutex is locked.
+	 If adapt_count would be decremented while unlocking after completing
+	 the critical section, possible waiters will be waked up before
+	 decrementing the adapt_count.  Those waked up waiters could have
+	 destroyed and freed this mutex!  */
+      short adapt_count_val = atomic_load_relaxed (adapt_count);
+      if (adapt_count_val > 0)
+	atomic_store_relaxed (adapt_count, adapt_count_val - 1);
+
+      lll_unlock ((*futex), private);
+    }
   return 0;
 }
diff --git a/sysdeps/unix/sysv/linux/s390/lowlevellock.h b/sysdeps/unix/sysv/linux/s390/lowlevellock.h
index 8d564ed..c60f4f7 100644
--- a/sysdeps/unix/sysv/linux/s390/lowlevellock.h
+++ b/sysdeps/unix/sysv/linux/s390/lowlevellock.h
@@ -50,7 +50,7 @@ extern int __lll_timedlock_elision
 extern int __lll_lock_elision (int *futex, short *adapt_count, int private)
   attribute_hidden;
 
-extern int __lll_unlock_elision(int *futex, int private)
+extern int __lll_unlock_elision(int *futex, short *adapt_count, int private)
   attribute_hidden;
 
 extern int __lll_trylock_elision(int *futex, short *adapt_count)
@@ -59,7 +59,7 @@ extern int __lll_trylock_elision(int *futex, short *adapt_count)
 # define lll_lock_elision(futex, adapt_count, private) \
   __lll_lock_elision (&(futex), &(adapt_count), private)
 #  define lll_unlock_elision(futex, adapt_count, private) \
-  __lll_unlock_elision (&(futex), private)
+  __lll_unlock_elision (&(futex), &(adapt_count), private)
 #  define lll_trylock_elision(futex, adapt_count) \
   __lll_trylock_elision(&(futex), &(adapt_count))
 #endif  /* ENABLE_LOCK_ELISION */
-- 
2.5.5

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PING][PATCH 1/4] S390: Use C11-like atomics instead of plain memory accesses in lock elision code.
  2016-12-06 13:52 [PATCH 1/4] S390: Use C11-like atomics instead of plain memory accesses in lock elision code Stefan Liebler
                   ` (2 preceding siblings ...)
  2016-12-06 13:52 ` [PATCH 4/4] S390: Optimize lock-elision by decrementing adapt_count at unlock Stefan Liebler
@ 2016-12-13  8:38 ` Stefan Liebler
  2016-12-16 17:12   ` Stefan Liebler
  2016-12-16 18:14   ` Torvald Riegel
  3 siblings, 2 replies; 35+ messages in thread
From: Stefan Liebler @ 2016-12-13  8:38 UTC (permalink / raw)
  To: libc-alpha; +Cc: Torvald Riegel

ping.

On 12/06/2016 02:51 PM, Stefan Liebler wrote:
> This uses atomic operations to access lock elision metadata that is accessed
> concurrently (ie, adapt_count fields).  The size of the data is less than a
> word but accessed only with atomic loads and stores.
>
> See also x86 commit ca6e601a9d4a72b3699cca15bad12ac1716bf49a:
> "Use C11-like atomics instead of plain memory accesses in x86 lock elision."
>
> ChangeLog:
>
> 	* sysdeps/unix/sysv/linux/s390/elision-lock.c
> 	(__lll_lock_elision): Use atomics to load / store adapt_count.
> 	* sysdeps/unix/sysv/linux/s390/elision-trylock.c
> 	(__lll_trylock_elision): Likewise.
> ---
>  sysdeps/unix/sysv/linux/s390/elision-lock.c    | 25 ++++++++++++++++++-------
>  sysdeps/unix/sysv/linux/s390/elision-trylock.c | 14 +++++++++-----
>  2 files changed, 27 insertions(+), 12 deletions(-)
>
> diff --git a/sysdeps/unix/sysv/linux/s390/elision-lock.c b/sysdeps/unix/sysv/linux/s390/elision-lock.c
> index ecb507e..1876d21 100644
> --- a/sysdeps/unix/sysv/linux/s390/elision-lock.c
> +++ b/sysdeps/unix/sysv/linux/s390/elision-lock.c
> @@ -45,11 +45,18 @@
>  int
>  __lll_lock_elision (int *futex, short *adapt_count, EXTRAARG int private)
>  {
> -  if (*adapt_count > 0)
> +  /* adapt_count can be accessed concurrently; these accesses can be both
> +     inside of transactions (if critical sections are nested and the outer
> +     critical section uses lock elision) and outside of transactions.  Thus,
> +     we need to use atomic accesses to avoid data races.  However, the
> +     value of adapt_count is just a hint, so relaxed MO accesses are
> +     sufficient.  */
> +  if (atomic_load_relaxed (adapt_count) > 0)
>      {
>        /* Lost updates are possible, but harmless.  Due to races this might lead
>  	 to *adapt_count becoming less than zero.  */
> -      (*adapt_count)--;
> +      atomic_store_relaxed (adapt_count,
> +			    atomic_load_relaxed (adapt_count) - 1);
>        goto use_lock;
>      }
>
> @@ -74,8 +81,10 @@ __lll_lock_elision (int *futex, short *adapt_count, EXTRAARG int private)
>  	      /* In a non-nested transaction there is no need to abort,
>  		 which is expensive.  */
>  	      __builtin_tend ();
> +	      /* Don't try to use transactions for the next couple of times.
> +		 See above for why relaxed MO is sufficient.  */
>  	      if (aconf.skip_lock_busy > 0)
> -		*adapt_count = aconf.skip_lock_busy;
> +		atomic_store_relaxed (adapt_count, aconf.skip_lock_busy);
>  	      goto use_lock;
>  	    }
>  	  else /* nesting depth is > 1 */
> @@ -101,18 +110,20 @@ __lll_lock_elision (int *futex, short *adapt_count, EXTRAARG int private)
>  	      /* A persistent abort (cc 1 or 3) indicates that a retry is
>  		 probably futile.  Use the normal locking now and for the
>  		 next couple of calls.
> -		 Be careful to avoid writing to the lock.  */
> +		 Be careful to avoid writing to the lock.  See above for why
> +		 relaxed MO is sufficient.  */
>  	      if (aconf.skip_lock_internal_abort > 0)
> -		*adapt_count = aconf.skip_lock_internal_abort;
> +		atomic_store_relaxed (adapt_count,
> +				      aconf.skip_lock_internal_abort);
>  	      goto use_lock;
>  	    }
>  	}
>      }
>
>    /* Same logic as above, but for for a number of temporary failures in a
> -     row.  */
> +     row.  See above for why relaxed MO is sufficient.  */
>    if (aconf.skip_lock_out_of_tbegin_retries > 0 && aconf.try_tbegin > 0)
> -    *adapt_count = aconf.skip_lock_out_of_tbegin_retries;
> +    atomic_store_relaxed (adapt_count, aconf.skip_lock_out_of_tbegin_retries);
>
>    use_lock:
>    return LLL_LOCK ((*futex), private);
> diff --git a/sysdeps/unix/sysv/linux/s390/elision-trylock.c b/sysdeps/unix/sysv/linux/s390/elision-trylock.c
> index 3d5a994..a3252b8 100644
> --- a/sysdeps/unix/sysv/linux/s390/elision-trylock.c
> +++ b/sysdeps/unix/sysv/linux/s390/elision-trylock.c
> @@ -49,8 +49,10 @@ __lll_trylock_elision (int *futex, short *adapt_count)
>        __builtin_tabort (_HTM_FIRST_USER_ABORT_CODE | 1);
>      }
>
> -  /* Only try a transaction if it's worth it.  */
> -  if (*adapt_count <= 0)
> +  /* Only try a transaction if it's worth it.  See __lll_lock_elision for
> +     why we need atomic accesses.  Relaxed MO is sufficient because this is
> +     just a hint.  */
> +  if (atomic_load_relaxed (adapt_count) <= 0)
>      {
>        unsigned status;
>
> @@ -65,9 +67,10 @@ __lll_trylock_elision (int *futex, short *adapt_count)
>  	  __builtin_tend ();
>  	  /* Note: Changing the adapt_count here might abort a transaction on a
>  	     different cpu, but that could happen anyway when the futex is
> -	     acquired, so there's no need to check the nesting depth here.  */
> +	     acquired, so there's no need to check the nesting depth here.
> +	     See above for why relaxed MO is sufficient.  */
>  	  if (aconf.skip_lock_busy > 0)
> -	    *adapt_count = aconf.skip_lock_busy;
> +	    atomic_store_relaxed (adapt_count, aconf.skip_lock_busy);
>  	}
>        else
>  	{
> @@ -87,7 +90,8 @@ __lll_trylock_elision (int *futex, short *adapt_count)
>      {
>        /* Lost updates are possible, but harmless.  Due to races this might lead
>  	 to *adapt_count becoming less than zero.  */
> -      (*adapt_count)--;
> +      atomic_store_relaxed (adapt_count,
> +			    atomic_load_relaxed (adapt_count) - 1);
>      }
>
>    return lll_trylock (*futex);
>

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PING][PATCH 1/4] S390: Use C11-like atomics instead of plain memory accesses in lock elision code.
  2016-12-13  8:38 ` [PING][PATCH 1/4] S390: Use C11-like atomics instead of plain memory accesses in lock elision code Stefan Liebler
@ 2016-12-16 17:12   ` Stefan Liebler
  2016-12-16 18:14   ` Torvald Riegel
  1 sibling, 0 replies; 35+ messages in thread
From: Stefan Liebler @ 2016-12-16 17:12 UTC (permalink / raw)
  To: libc-alpha; +Cc: Torvald Riegel

On 12/13/2016 09:38 AM, Stefan Liebler wrote:
> ping.
>
Any comments, objections?
Otherwise I plan to commit this s390 specific patches next week.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PING][PATCH 1/4] S390: Use C11-like atomics instead of plain memory accesses in lock elision code.
  2016-12-13  8:38 ` [PING][PATCH 1/4] S390: Use C11-like atomics instead of plain memory accesses in lock elision code Stefan Liebler
  2016-12-16 17:12   ` Stefan Liebler
@ 2016-12-16 18:14   ` Torvald Riegel
  2016-12-20 14:28     ` Stefan Liebler
  1 sibling, 1 reply; 35+ messages in thread
From: Torvald Riegel @ 2016-12-16 18:14 UTC (permalink / raw)
  To: Stefan Liebler; +Cc: libc-alpha

On Tue, 2016-12-13 at 09:38 +0100, Stefan Liebler wrote:
> ping.

Sorry, I've been busy with the robust mutex bugs.

This patch is OK.  Thanks.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PING][PATCH 1/4] S390: Use C11-like atomics instead of plain memory accesses in lock elision code.
  2016-12-16 18:14   ` Torvald Riegel
@ 2016-12-20 14:28     ` Stefan Liebler
  2016-12-22  9:34       ` Torvald Riegel
  0 siblings, 1 reply; 35+ messages in thread
From: Stefan Liebler @ 2016-12-20 14:28 UTC (permalink / raw)
  To: libc-alpha

On 12/16/2016 07:14 PM, Torvald Riegel wrote:
> On Tue, 2016-12-13 at 09:38 +0100, Stefan Liebler wrote:
>> ping.
>
> Sorry, I've been busy with the robust mutex bugs.
>
> This patch is OK.  Thanks.
>
Thanks. Committed.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PING][PATCH 1/4] S390: Use C11-like atomics instead of plain memory accesses in lock elision code.
  2016-12-20 14:28     ` Stefan Liebler
@ 2016-12-22  9:34       ` Torvald Riegel
  2017-01-10 11:34         ` Torvald Riegel
  2017-01-11 16:06         ` Stefan Liebler
  0 siblings, 2 replies; 35+ messages in thread
From: Torvald Riegel @ 2016-12-22  9:34 UTC (permalink / raw)
  To: Stefan Liebler; +Cc: libc-alpha, Carlos O'Donell

On Tue, 2016-12-20 at 15:28 +0100, Stefan Liebler wrote:
> On 12/16/2016 07:14 PM, Torvald Riegel wrote:
> > On Tue, 2016-12-13 at 09:38 +0100, Stefan Liebler wrote:
> >> ping.
> >
> > Sorry, I've been busy with the robust mutex bugs.
> >
> > This patch is OK.  Thanks.
> >
> Thanks. Committed.

I've reviewed and ack'ed patch 1 of 4, but it looks like you committed
patches 2-4 as well.  Did anyone review these?

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 2/4] S390: Use own tbegin macro instead of __builtin_tbegin.
  2016-12-06 13:52 ` [PATCH 2/4] S390: Use own tbegin macro instead of __builtin_tbegin Stefan Liebler
@ 2016-12-22 12:54   ` Florian Weimer
  2017-01-10 12:32     ` Stefan Liebler
  2017-01-10 16:34   ` Torvald Riegel
  1 sibling, 1 reply; 35+ messages in thread
From: Florian Weimer @ 2016-12-22 12:54 UTC (permalink / raw)
  To: Stefan Liebler, libc-alpha

On 12/06/2016 02:51 PM, Stefan Liebler wrote:

> The used inline assembly does not clobber the fprs / vrs!
> Clobbering the latter ones would force the compiler to save / restore
> the call saved fprs as those overlap with the vrs, but they only
> need to be restored if the transaction fails.  Thus the user of the
> tbegin macros has to compile the file / function with -msoft-float.
> It prevents gcc from using fprs / vrs.

Is it possible to disable the new macros if the file is not compiled 
with -msoft-float?

Thanks,
Florian

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PING][PATCH 1/4] S390: Use C11-like atomics instead of plain memory accesses in lock elision code.
  2016-12-22  9:34       ` Torvald Riegel
@ 2017-01-10 11:34         ` Torvald Riegel
  2017-01-11 16:06         ` Stefan Liebler
  1 sibling, 0 replies; 35+ messages in thread
From: Torvald Riegel @ 2017-01-10 11:34 UTC (permalink / raw)
  To: Stefan Liebler; +Cc: libc-alpha, Carlos O'Donell

On Thu, 2016-12-22 at 10:34 +0100, Torvald Riegel wrote:
> On Tue, 2016-12-20 at 15:28 +0100, Stefan Liebler wrote:
> > On 12/16/2016 07:14 PM, Torvald Riegel wrote:
> > > On Tue, 2016-12-13 at 09:38 +0100, Stefan Liebler wrote:
> > >> ping.
> > >
> > > Sorry, I've been busy with the robust mutex bugs.
> > >
> > > This patch is OK.  Thanks.
> > >
> > Thanks. Committed.
> 
> I've reviewed and ack'ed patch 1 of 4, but it looks like you committed
> patches 2-4 as well.  Did anyone review these?
> 

Ping.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 2/4] S390: Use own tbegin macro instead of __builtin_tbegin.
  2016-12-22 12:54   ` Florian Weimer
@ 2017-01-10 12:32     ` Stefan Liebler
  0 siblings, 0 replies; 35+ messages in thread
From: Stefan Liebler @ 2017-01-10 12:32 UTC (permalink / raw)
  To: libc-alpha

On 12/22/2016 01:54 PM, Florian Weimer wrote:
> On 12/06/2016 02:51 PM, Stefan Liebler wrote:
>
>> The used inline assembly does not clobber the fprs / vrs!
>> Clobbering the latter ones would force the compiler to save / restore
>> the call saved fprs as those overlap with the vrs, but they only
>> need to be restored if the transaction fails.  Thus the user of the
>> tbegin macros has to compile the file / function with -msoft-float.
>> It prevents gcc from using fprs / vrs.
>
> Is it possible to disable the new macros if the file is not compiled
> with -msoft-float?
Those macros/builtins are only used for the mutex lock elision code in 
elision-*.c files. This code does not use float computation. The 
compiler flag is used to prevent the compiler to perform optimizations 
which use fprs or vrs. At the moment gcc performs no such optimizations.

If you want to use such macros/builtins in other files you'll at least 
have to add -mhtm compiler flag to be able to use those builtins.

Bye
Stefan

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 2/4] S390: Use own tbegin macro instead of __builtin_tbegin.
  2016-12-06 13:52 ` [PATCH 2/4] S390: Use own tbegin macro instead of __builtin_tbegin Stefan Liebler
  2016-12-22 12:54   ` Florian Weimer
@ 2017-01-10 16:34   ` Torvald Riegel
  2017-01-12 15:45     ` Florian Weimer
  2017-01-17 15:28     ` Stefan Liebler
  1 sibling, 2 replies; 35+ messages in thread
From: Torvald Riegel @ 2017-01-10 16:34 UTC (permalink / raw)
  To: Stefan Liebler; +Cc: libc-alpha, Florian Weimer

On Tue, 2016-12-06 at 14:51 +0100, Stefan Liebler wrote:
> We have to use an own inline assembly instead of __builtin_tbegin,
> as tbegin has to filter program interruptions which can't be done with
> the builtin.  Before this change, e.g. a segmentation fault within a
> transaction, leads to a coredump where the instruction pointer points
> behind the tbegin instruction instead of real failing one.
> Now the transaction aborts and the code should be reexecuted by the
> fallback path without transactions.  The segmentation fault will
> produce a coredump with the real failing instruction.

I'm not sure that this is always a good thing to do.  It is nicer for
programmers if they can get an instruction pointer that points to where
the faulty access happened.  However:

(1) This can mask failures present in an application, because the
fallback path is not guaranteed to operate on the same data.  It may
never run into this problem.  Can inconsistencies arise due to that?
For example, is a masked segfault still visible through performance
counters or something like that?

(2) This introduces a facility to probe memory for being accessible or
not, considering that you say it masks segfaults.  It seems that this
probing may not be visible to the same extent as possible if a signal
handler were installed.  Is this relevant from a security perspective?
It may not be given that perhaps the user can do that anyway through
direct usage of transactions, or perhaps because the user would have to
be able to check whether the program is currently executing in a
transaction or not.
It might be good to at least mention this in a comment in the code.

> +++ b/sysdeps/unix/sysv/linux/s390/htm.h
> @@ -0,0 +1,149 @@
> +/* Shared HTM header.  Work around false transactional execution facility
> +   intrinsics.
> +
> +   Copyright (C) 2016 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#ifndef _HTM_H
> +#define _HTM_H 1
> +
> +#include <htmintrin.h>
> +
> +#ifdef __s390x__
> +# define TX_FPRS_BYTES 64
> +# define TX_SAVE_FPRS						\
> +  "   std %%f8, 0(%[R_FPRS])\n\t"				\
> +  "   std %%f9, 8(%[R_FPRS])\n\t"				\
> +  "   std %%f10, 16(%[R_FPRS])\n\t"				\
> +  "   std %%f11, 24(%[R_FPRS])\n\t"				\
> +  "   std %%f12, 32(%[R_FPRS])\n\t"				\
> +  "   std %%f13, 40(%[R_FPRS])\n\t"				\
> +  "   std %%f14, 48(%[R_FPRS])\n\t"				\
> +  "   std %%f15, 56(%[R_FPRS])\n\t"
> +
> +# define TX_RESTORE_FPRS					\
> +  "   ld %%f8, 0(%[R_FPRS])\n\t"				\
> +  "   ld %%f9, 8(%[R_FPRS])\n\t"				\
> +  "   ld %%f10, 16(%[R_FPRS])\n\t"				\
> +  "   ld %%f11, 24(%[R_FPRS])\n\t"				\
> +  "   ld %%f12, 32(%[R_FPRS])\n\t"				\
> +  "   ld %%f13, 40(%[R_FPRS])\n\t"				\
> +  "   ld %%f14, 48(%[R_FPRS])\n\t"				\
> +  "   ld %%f15, 56(%[R_FPRS])\n\t"
> +
> +#else
> +
> +# define TX_FPRS_BYTES 16
> +# define TX_SAVE_FPRS						\
> +  "   std %%f4, 0(%[R_FPRS])\n\t"				\
> +  "   std %%f6, 8(%[R_FPRS])\n\t"
> +
> +# define TX_RESTORE_FPRS					\
> +  "   ld %%f4, 0(%[R_FPRS])\n\t"				\
> +  "   ld %%f6, 8(%[R_FPRS])\n\t"
> +
> +#endif /* ! __s390x__  */
> +
> +/* Use own inline assembly instead of __builtin_tbegin, as tbegin
> +   has to filter program interruptions which can't be done with the builtin.
> +   Now the fprs have to be saved / restored here, too.
> +   The fpc is also not saved / restored with the builtin.
> +   The used inline assembly does not clobber the volatile fprs / vrs!
> +   Clobbering the latter ones would force the compiler to save / restore
> +   the call saved fprs as those overlap with the vrs, but they only need to be
> +   restored if the transaction fails but not if the transaction is successfully
> +   started.  Thus the user of the tbegin macros in this header file has to
> +   compile the file / function with -msoft-float.  It prevents gcc from using
> +   fprs / vrs.  */
> +#define __libc_tbegin(tdb)						\
> +  ({ int __ret;								\
> +     int __fpc;								\
> +     char __fprs[TX_FPRS_BYTES];					\
> +     __asm__ __volatile__ (".machine push\n\t"				\
> +			   ".machinemode \"zarch_nohighgprs\"\n\t"	\
> +			   ".machine \"all\"\n\t"			\
> +			   /* Save state at the outermost transaction.	\
> +			      As extracting nesting depth is expensive	\
> +			      on at least zEC12, save fprs at inner	\
> +			      transactions, too.			\
> +			      The fpc and fprs are saved here as they	\
> +			      are not saved by tbegin.  There exist no	\
> +			      call-saved vrs, thus they are not saved	\
> +			      here.  */					\
> +			   "   efpc %[R_FPC]\n\t"			\
> +			   TX_SAVE_FPRS					\
> +			   /* Begin transaction: save all gprs, allow	\
> +			      ar modification and fp operations.  Some	\
> +			      program-interruptions (e.g. a null	\
> +			      pointer access) are filtered and the	\
> +			      trancsaction will abort.  In this case	\
> +			      the normal lock path will execute it	\
> +			      again and result in a core dump wich does	\
> +			      now show at tbegin but the real executed	\
> +			      instruction.  */				\
> +			   "   tbegin 0, 0xFF0E\n\t"			\
> +			   /* Branch away in abort case (this is the	\
> +			      prefered sequence.  See PoP in chapter 5	\
> +			      Transactional-Execution Facility		\
> +			      Operation).  */				\
> +			   "   jnz 0f\n\t"				\
> +			   /* Transaction has successfully started.  */	\
> +			   "   lhi %[R_RET], 0\n\t"			\
> +			   "   j 1f\n\t"				\
> +			   /* Transaction has aborted.  Now we are at	\
> +			      the outermost transaction.  Restore fprs	\
> +			      and fpc. */				\
> +			   "0: ipm %[R_RET]\n\t"			\
> +			   "   srl %[R_RET], 28\n\t"			\
> +			   "   sfpc %[R_FPC]\n\t"			\
> +			   TX_RESTORE_FPRS				\
> +			   "1:\n\t"					\
> +			   ".machine pop\n"				\
> +			   : [R_RET] "=&d" (__ret),			\
> +			     [R_FPC] "=&d" (__fpc)			\
> +			   : [R_FPRS] "a" (__fprs)			\
> +			   : "cc", "memory");				\
> +     __ret;								\
> +     })
> +
> +/* These builtins are correct.  Use them.  */

Is "correct" the right word here?  I suppose they are always correct,
it's just that you want something different for glibc internally.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 3/4] S390: Use new __libc_tbegin_retry macro in elision-lock.c.
  2016-12-06 13:52 ` [PATCH 3/4] S390: Use new __libc_tbegin_retry macro in elision-lock.c Stefan Liebler
@ 2017-01-10 16:53   ` Torvald Riegel
  2017-01-17 15:28     ` Stefan Liebler
  0 siblings, 1 reply; 35+ messages in thread
From: Torvald Riegel @ 2017-01-10 16:53 UTC (permalink / raw)
  To: Stefan Liebler; +Cc: libc-alpha

On Tue, 2016-12-06 at 14:51 +0100, Stefan Liebler wrote:
> This patch implements __libc_tbegin_retry macro which is equivalent to
> gcc builtin __builtin_tbegin_retry, except the changes which were applied
> to __libc_tbegin in the previous patch.
> 
> If tbegin aborts with _HTM_TBEGIN_TRANSIENT.  Then this macros restores
> the fpc, fprs and automatically retries up to retry_cnt tbegins.
> Further saving of the state is omitted as it is already saved in the
> first round.  Before retrying a further transaction, the
> transaction-abort-assist instruction is used to support the cpu.

This looks okay to me on the surface of it, but I don't know anything
about s390 asm.

> Use __libc_tbegin_retry macro.
> ---
>  sysdeps/unix/sysv/linux/s390/elision-lock.c | 50 ++++++++++++++---------------
>  sysdeps/unix/sysv/linux/s390/htm.h          | 36 +++++++++++++++++++--
>  2 files changed, 58 insertions(+), 28 deletions(-)
> 
> diff --git a/sysdeps/unix/sysv/linux/s390/elision-lock.c b/sysdeps/unix/sysv/linux/s390/elision-lock.c
> index 48cc3db..3dd7fbc 100644
> --- a/sysdeps/unix/sysv/linux/s390/elision-lock.c
> +++ b/sysdeps/unix/sysv/linux/s390/elision-lock.c
> @@ -60,17 +60,16 @@ __lll_lock_elision (int *futex, short *adapt_count, EXTRAARG int private)
>        goto use_lock;
>      }
>  
> -  int try_tbegin;
> -  for (try_tbegin = aconf.try_tbegin;
> -       try_tbegin > 0;
> -       try_tbegin--)
> +  if (aconf.try_tbegin > 0)
>      {
> -      int status;
> -      if (__builtin_expect
> -	  ((status = __libc_tbegin ((void *) 0)) == _HTM_TBEGIN_STARTED, 1))
> +      int status = __libc_tbegin_retry ((void *) 0, aconf.try_tbegin - 1);

Maybe add a comment that the macro that reminds the reader that the
macro expects a retry count, not how often a transaction should be
tried.

> +      if (__builtin_expect (status == _HTM_TBEGIN_STARTED,
> +			    _HTM_TBEGIN_STARTED))
>  	{
> -	  if (*futex == 0)
> +	  if (__builtin_expect (*futex == 0, 1))
> +	    /* Lock was free.  Return to user code in a transaction.  */
>  	    return 0;
> +
>  	  /* Lock was busy.  Fall back to normal locking.  */
>  	  if (__builtin_expect (__libc_tx_nesting_depth (), 1))
>  	    {
> @@ -81,7 +80,6 @@ __lll_lock_elision (int *futex, short *adapt_count, EXTRAARG int private)
>  		 See above for why relaxed MO is sufficient.  */
>  	      if (aconf.skip_lock_busy > 0)
>  		atomic_store_relaxed (adapt_count, aconf.skip_lock_busy);
> -	      goto use_lock;
>  	    }
>  	  else /* nesting depth is > 1 */
>  	    {
> @@ -99,28 +97,28 @@ __lll_lock_elision (int *futex, short *adapt_count, EXTRAARG int private)
>  	      __libc_tabort (_HTM_FIRST_USER_ABORT_CODE | 1);
>  	    }
>  	}
> +      else if (status != _HTM_TBEGIN_TRANSIENT)
> +	{
> +	  /* A persistent abort (cc 1 or 3) indicates that a retry is
> +	     probably futile.  Use the normal locking now and for the
> +	     next couple of calls.
> +	     Be careful to avoid writing to the lock.  See above for why
> +	     relaxed MO is sufficient.  */
> +	  if (aconf.skip_lock_internal_abort > 0)
> +	    atomic_store_relaxed (adapt_count,
> +				  aconf.skip_lock_internal_abort);
> +	}
>        else
>  	{
> -	  if (status != _HTM_TBEGIN_TRANSIENT)
> -	    {
> -	      /* A persistent abort (cc 1 or 3) indicates that a retry is
> -		 probably futile.  Use the normal locking now and for the
> -		 next couple of calls.
> -		 Be careful to avoid writing to the lock.  See above for why
> -		 relaxed MO is sufficient.  */
> -	      if (aconf.skip_lock_internal_abort > 0)
> -		atomic_store_relaxed (adapt_count,
> -				      aconf.skip_lock_internal_abort);
> -	      goto use_lock;
> -	    }
> +	  /* Same logic as above, but for for a number of temporary failures in
> +	     a row.  */

for for

> +	  if (aconf.skip_lock_out_of_tbegin_retries > 0)
> +	    atomic_store_relaxed (adapt_count,
> +				  aconf.skip_lock_out_of_tbegin_retries);
>  	}
>      }
>  
> -  /* Same logic as above, but for for a number of temporary failures in a
> -     row.  See above for why relaxed MO is sufficient.  */
> -  if (aconf.skip_lock_out_of_tbegin_retries > 0 && aconf.try_tbegin > 0)
> -    atomic_store_relaxed (adapt_count, aconf.skip_lock_out_of_tbegin_retries);
> -
>    use_lock:
> +  /* Use normal locking as fallback path if transaction does not succeed.  */

the transaction

>    return LLL_LOCK ((*futex), private);
>  }


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 4/4] S390: Optimize lock-elision by decrementing adapt_count at unlock.
  2016-12-06 13:52 ` [PATCH 4/4] S390: Optimize lock-elision by decrementing adapt_count at unlock Stefan Liebler
@ 2017-01-11 10:53   ` Torvald Riegel
  2017-01-17 15:28     ` Stefan Liebler
  0 siblings, 1 reply; 35+ messages in thread
From: Torvald Riegel @ 2017-01-11 10:53 UTC (permalink / raw)
  To: Stefan Liebler; +Cc: libc-alpha

On Tue, 2016-12-06 at 14:51 +0100, Stefan Liebler wrote:
> This patch decrements the adapt_count while unlocking the futex
> instead of before aquiring the futex as it is done on power, too.
> Furthermore a transaction is only started if the futex is currently free.
> This check is done after starting the transaction, too.
> If the futex is not free and the transaction nesting depth is one,
> we can simply end the started transaction instead of aborting it.
> The implementation of this check was faulty as it always ended the
> started transaction.  By using the fallback path, the the outermost
> transaction was aborted.  Now the outermost transaction is aborted
> directly.
> 
> This patch also adds some commentary and aligns the code in
> elision-trylock.c to the code in elision-lock.c as possible.

I don't think this is quite ready yet.  See below for details.

I'm not too concerned about this fact, given that it's just in
s390-specific code.  But generally, I'd prefer if arch-specific code
aims for the same quality and level of consensus about it as what is our
aim for generic code.

> ChangeLog:
> 
> 	* sysdeps/unix/sysv/linux/s390/lowlevellock.h
> 	(__lll_unlock_elision, lll_unlock_elision): Add adapt_count argument.
> 	* sysdeps/unix/sysv/linux/s390/elision-lock.c:
> 	(__lll_lock_elision): Decrement adapt_count while unlocking
> 	instead of before locking.
> 	* sysdeps/unix/sysv/linux/s390/elision-trylock.c
> 	(__lll_trylock_elision): Likewise.
> 	* sysdeps/unix/sysv/linux/s390/elision-unlock.c:
> 	(__lll_unlock_elision): Likewise.
> ---
>  sysdeps/unix/sysv/linux/s390/elision-lock.c    | 37 ++++++++-------
>  sysdeps/unix/sysv/linux/s390/elision-trylock.c | 62 ++++++++++++++------------
>  sysdeps/unix/sysv/linux/s390/elision-unlock.c  | 29 ++++++++++--
>  sysdeps/unix/sysv/linux/s390/lowlevellock.h    |  4 +-
>  4 files changed, 78 insertions(+), 54 deletions(-)
> 
> diff --git a/sysdeps/unix/sysv/linux/s390/elision-lock.c b/sysdeps/unix/sysv/linux/s390/elision-lock.c
> index 3dd7fbc..4a7d546 100644
> --- a/sysdeps/unix/sysv/linux/s390/elision-lock.c
> +++ b/sysdeps/unix/sysv/linux/s390/elision-lock.c
> @@ -50,31 +50,30 @@ __lll_lock_elision (int *futex, short *adapt_count, EXTRAARG int private)
>       critical section uses lock elision) and outside of transactions.  Thus,
>       we need to use atomic accesses to avoid data races.  However, the
>       value of adapt_count is just a hint, so relaxed MO accesses are
> -     sufficient.  */
> -  if (atomic_load_relaxed (adapt_count) > 0)
> -    {
> -      /* Lost updates are possible, but harmless.  Due to races this might lead
> -	 to *adapt_count becoming less than zero.  */
> -      atomic_store_relaxed (adapt_count,
> -			    atomic_load_relaxed (adapt_count) - 1);
> -      goto use_lock;
> -    }
> -
> -  if (aconf.try_tbegin > 0)
> +     sufficient.
> +     Do not begin a transaction if another cpu has locked the
> +     futex with normal locking.  If adapt_count is zero, it remains and the
> +     next pthread_mutex_lock call will try to start a transaction again.  */

This seems to make an assumption about performance that should be
explained in the comment.  IIRC, x86 LE does not make this assumption,
so it's not generally true.  I suppose s390 aborts are really expensive,
and you don't expect that a lock is in the acquired state often enough
so that aborts are overall more costly than the overhead of the
additional load and branch?

> +    if (atomic_load_relaxed (futex) == 0
> +	&& atomic_load_relaxed (adapt_count) <= 0 && aconf.try_tbegin > 0)
>      {
>        int status = __libc_tbegin_retry ((void *) 0, aconf.try_tbegin - 1);
>        if (__builtin_expect (status == _HTM_TBEGIN_STARTED,
>  			    _HTM_TBEGIN_STARTED))
>  	{
> -	  if (__builtin_expect (*futex == 0, 1))
> +	  /* Check the futex to make sure nobody has touched it in the
> +	     mean time.  This forces the futex into the cache and makes
> +	     sure the transaction aborts if some other cpu uses the
> +	     lock (writes the futex).  */

s/cpu/CPU/

I'd also say "if another thread acquires the lock concurrently" instead
of the last part of that sentence.

> +	  if (__builtin_expect (atomic_load_relaxed (futex) == 0, 1))
>  	    /* Lock was free.  Return to user code in a transaction.  */
>  	    return 0;
>  
>  	  /* Lock was busy.  Fall back to normal locking.  */
> -	  if (__builtin_expect (__libc_tx_nesting_depth (), 1))
> +	  if (__builtin_expect (__libc_tx_nesting_depth () <= 1, 1))

Use __glibc_likely?

>  	    {
>  	      /* In a non-nested transaction there is no need to abort,
> -		 which is expensive.  */
> +		 which is expensive.  Simply end the started transaction.  */
>  	      __libc_tend ();
>  	      /* Don't try to use transactions for the next couple of times.
>  		 See above for why relaxed MO is sufficient.  */
> @@ -92,9 +91,9 @@ __lll_lock_elision (int *futex, short *adapt_count, EXTRAARG int private)
>  		 is zero.
>  		 The adapt_count of this inner mutex is not changed,
>  		 because using the default lock with the inner mutex
> -		 would abort the outer transaction.
> -	      */
> +		 would abort the outer transaction.  */
>  	      __libc_tabort (_HTM_FIRST_USER_ABORT_CODE | 1);
> +	      __builtin_unreachable ();
>  	    }
>  	}
>        else if (status != _HTM_TBEGIN_TRANSIENT)
> @@ -110,15 +109,15 @@ __lll_lock_elision (int *futex, short *adapt_count, EXTRAARG int private)
>  	}
>        else
>  	{
> -	  /* Same logic as above, but for for a number of temporary failures in
> -	     a row.  */
> +	  /* The transaction failed for some retries with
> +	     _HTM_TBEGIN_TRANSIENT.  Use the normal locking now and for the
> +	     next couple of calls.  */
>  	  if (aconf.skip_lock_out_of_tbegin_retries > 0)
>  	    atomic_store_relaxed (adapt_count,
>  				  aconf.skip_lock_out_of_tbegin_retries);
>  	}
>      }
>  
> -  use_lock:
>    /* Use normal locking as fallback path if transaction does not succeed.  */
>    return LLL_LOCK ((*futex), private);
>  }
> diff --git a/sysdeps/unix/sysv/linux/s390/elision-trylock.c b/sysdeps/unix/sysv/linux/s390/elision-trylock.c
> index e21fc26..dee66d4 100644
> --- a/sysdeps/unix/sysv/linux/s390/elision-trylock.c
> +++ b/sysdeps/unix/sysv/linux/s390/elision-trylock.c
> @@ -43,23 +43,36 @@ __lll_trylock_elision (int *futex, short *adapt_count)
>  	 until their try_tbegin is zero.
>        */
>        __libc_tabort (_HTM_FIRST_USER_ABORT_CODE | 1);
> +      __builtin_unreachable ();
>      }
>  
> -  /* Only try a transaction if it's worth it.  See __lll_lock_elision for
> -     why we need atomic accesses.  Relaxed MO is sufficient because this is
> -     just a hint.  */
> -  if (atomic_load_relaxed (adapt_count) <= 0)
> +  /* adapt_count can be accessed concurrently; these accesses can be both
> +     inside of transactions (if critical sections are nested and the outer
> +     critical section uses lock elision) and outside of transactions.  Thus,
> +     we need to use atomic accesses to avoid data races.  However, the
> +     value of adapt_count is just a hint, so relaxed MO accesses are
> +     sufficient.
> +     Do not begin a transaction if another cpu has locked the
> +     futex with normal locking.  If adapt_count is zero, it remains and the
> +     next pthread_mutex_lock call will try to start a transaction again.  */
> +    if (atomic_load_relaxed (futex) == 0
> +	&& atomic_load_relaxed (adapt_count) <= 0 && aconf.try_tbegin > 0)
>      {
> -      int status;
> -
> -      if (__builtin_expect
> -	  ((status = __libc_tbegin ((void *) 0)) == _HTM_TBEGIN_STARTED, 1))
> +      int status = __libc_tbegin ((void *) 0);
> +      if (__builtin_expect (status  == _HTM_TBEGIN_STARTED,
> +			    _HTM_TBEGIN_STARTED))
>  	{
> -	  if (*futex == 0)
> +	  /* Check the futex to make sure nobody has touched it in the
> +	     mean time.  This forces the futex into the cache and makes
> +	     sure the transaction aborts if some other cpu uses the
> +	     lock (writes the futex).  */
> +	  if (__builtin_expect (atomic_load_relaxed (futex) == 0, 1))

__glibc_likely

> +	    /* Lock was free.  Return to user code in a transaction.  */
>  	    return 0;
> -	  /* Lock was busy.  Fall back to normal locking.  */
> -	  /* Since we are in a non-nested transaction there is no need to abort,
> -	     which is expensive.  */
> +
> +	  /* Lock was busy.  Fall back to normal locking.  Since we are in
> +	     a non-nested transaction there is no need to abort, which is
> +	     expensive.  Simply end the started transaction.  */
>  	  __libc_tend ();
>  	  /* Note: Changing the adapt_count here might abort a transaction on a
>  	     different cpu, but that could happen anyway when the futex is
> @@ -68,27 +81,18 @@ __lll_trylock_elision (int *futex, short *adapt_count)
>  	  if (aconf.skip_lock_busy > 0)
>  	    atomic_store_relaxed (adapt_count, aconf.skip_lock_busy);
>  	}
> -      else
> +      else if (status != _HTM_TBEGIN_TRANSIENT)
>  	{
> -	  if (status != _HTM_TBEGIN_TRANSIENT)
> -	    {
> -	      /* A persistent abort (cc 1 or 3) indicates that a retry is
> -		 probably futile.  Use the normal locking now and for the
> -		 next couple of calls.
> -		 Be careful to avoid writing to the lock.  */
> -	      if (aconf.skip_trylock_internal_abort > 0)
> -		*adapt_count = aconf.skip_trylock_internal_abort;
> -	    }
> +	  /* A persistent abort (cc 1 or 3) indicates that a retry is
> +	     probably futile.  Use the normal locking now and for the
> +	     next couple of calls.
> +	     Be careful to avoid writing to the lock.  */
> +	  if (aconf.skip_trylock_internal_abort > 0)
> +	    *adapt_count = aconf.skip_trylock_internal_abort;
>  	}
>        /* Could do some retries here.  */
>      }
> -  else
> -    {
> -      /* Lost updates are possible, but harmless.  Due to races this might lead
> -	 to *adapt_count becoming less than zero.  */
> -      atomic_store_relaxed (adapt_count,
> -			    atomic_load_relaxed (adapt_count) - 1);
> -    }
>  
> +  /* Use normal locking as fallback path if transaction does not succeed.  */
>    return lll_trylock (*futex);
>  }
> diff --git a/sysdeps/unix/sysv/linux/s390/elision-unlock.c b/sysdeps/unix/sysv/linux/s390/elision-unlock.c
> index 0b1ade9..e68d970 100644
> --- a/sysdeps/unix/sysv/linux/s390/elision-unlock.c
> +++ b/sysdeps/unix/sysv/linux/s390/elision-unlock.c
> @@ -21,16 +21,37 @@
>  #include <htm.h>
>  
>  int
> -__lll_unlock_elision(int *futex, int private)
> +__lll_unlock_elision(int *futex, short *adapt_count, int private)
>  {
>    /* If the lock is free, we elided the lock earlier.  This does not
>       necessarily mean that we are in a transaction, because the user code may
> -     have closed the transaction, but that is impossible to detect reliably.  */
> -  if (*futex == 0)
> +     have closed the transaction, but that is impossible to detect reliably.
> +     Relaxed MO access to futex is sufficient as we only need a hint, if we

This comment is incorrect because we don't just need just a hint here.
The reason why relaxed MO is sufficient is because a correct program
will only release a lock it has acquired; therefore, it must either
changed the futex word's value to something !=0 or it must have used
elision; these are actions by the same thread, so these actions are
sequenced-before the relaxed load (and thus also happens-before the
relaxed load).  Therefore, relaxed MO is sufficient.

> +     started a transaction or acquired the futex in e.g. elision-lock.c.  */
> +  if (atomic_load_relaxed (futex) == 0)
>      {
>        __libc_tend ();
>      }
>    else
> -    lll_unlock ((*futex), private);
> +    {
> +      /* Update the adapt_count while unlocking before completing the critical
> +	 section.  adapt_count is accessed concurrently outside of a
> +	 transaction or an aquired lock e.g. in elision-lock.c so we need to use

transaction or a critical section (e.g., in elision-lock.c); so, we need
to use

> +	 atomic accesses.  However, the value of adapt_count is just a hint, so
> +	 relaxed MO accesses are sufficient.
> +	 If adapt_count would be decremented while locking, multiple
> +	 CPUs trying to lock the locked mutex will decrement adapt_count to
> +	 zero and another CPU will try to start a transaction, which will be
> +	 immediately aborted as the mutex is locked.

I don't think this is necessarily the case.  It is true that if more
than one thread decrements, only one would immediately try to use
elision (because only one decrements from 1 (ignoring lost updates)).
However, if you decrement in the critical section, and lock acquisitions
wait until the lock is free *before* loading adapt_count and choosing
whether to use elision or not, then it shouldn't matter whether you
decrement closer to the lock acquisition or closer to the release.

I think this needs a more thorough analysis (including better
documentation) and/or a microbenchmark.

> +	 If adapt_count would be decremented while unlocking after completing
> +	 the critical section, possible waiters will be waked up before
> +	 decrementing the adapt_count.  Those waked up waiters could have
> +	 destroyed and freed this mutex!  */

Please fix this sentence.  Or you could just say that POSIX' mutex
destruction requirements disallow accesses to the mutexes after it has
been released and thus could have been acquired by another thread.

> +      short adapt_count_val = atomic_load_relaxed (adapt_count);
> +      if (adapt_count_val > 0)
> +	atomic_store_relaxed (adapt_count, adapt_count_val - 1);
> +
> +      lll_unlock ((*futex), private);
> +    }
>    return 0;
>  }
> diff --git a/sysdeps/unix/sysv/linux/s390/lowlevellock.h b/sysdeps/unix/sysv/linux/s390/lowlevellock.h
> index 8d564ed..c60f4f7 100644
> --- a/sysdeps/unix/sysv/linux/s390/lowlevellock.h
> +++ b/sysdeps/unix/sysv/linux/s390/lowlevellock.h
> @@ -50,7 +50,7 @@ extern int __lll_timedlock_elision
>  extern int __lll_lock_elision (int *futex, short *adapt_count, int private)
>    attribute_hidden;
>  
> -extern int __lll_unlock_elision(int *futex, int private)
> +extern int __lll_unlock_elision(int *futex, short *adapt_count, int private)
>    attribute_hidden;
>  
>  extern int __lll_trylock_elision(int *futex, short *adapt_count)
> @@ -59,7 +59,7 @@ extern int __lll_trylock_elision(int *futex, short *adapt_count)
>  # define lll_lock_elision(futex, adapt_count, private) \
>    __lll_lock_elision (&(futex), &(adapt_count), private)
>  #  define lll_unlock_elision(futex, adapt_count, private) \
> -  __lll_unlock_elision (&(futex), private)
> +  __lll_unlock_elision (&(futex), &(adapt_count), private)
>  #  define lll_trylock_elision(futex, adapt_count) \
>    __lll_trylock_elision(&(futex), &(adapt_count))
>  #endif  /* ENABLE_LOCK_ELISION */



^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PING][PATCH 1/4] S390: Use C11-like atomics instead of plain memory accesses in lock elision code.
  2016-12-22  9:34       ` Torvald Riegel
  2017-01-10 11:34         ` Torvald Riegel
@ 2017-01-11 16:06         ` Stefan Liebler
  2017-01-12  9:43           ` Torvald Riegel
  1 sibling, 1 reply; 35+ messages in thread
From: Stefan Liebler @ 2017-01-11 16:06 UTC (permalink / raw)
  To: libc-alpha

On 12/22/2016 10:34 AM, Torvald Riegel wrote:
> On Tue, 2016-12-20 at 15:28 +0100, Stefan Liebler wrote:
>> On 12/16/2016 07:14 PM, Torvald Riegel wrote:
>>> On Tue, 2016-12-13 at 09:38 +0100, Stefan Liebler wrote:
>>>> ping.
>>>
>>> Sorry, I've been busy with the robust mutex bugs.
>>>
>>> This patch is OK.  Thanks.
>>>
>> Thanks. Committed.
>
> I've reviewed and ack'ed patch 1 of 4, but it looks like you committed
> patches 2-4 as well.  Did anyone review these?
>
Sorry. That was my fault.
Shall I revert the other patches and continue after the release?

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PING][PATCH 1/4] S390: Use C11-like atomics instead of plain memory accesses in lock elision code.
  2017-01-11 16:06         ` Stefan Liebler
@ 2017-01-12  9:43           ` Torvald Riegel
  0 siblings, 0 replies; 35+ messages in thread
From: Torvald Riegel @ 2017-01-12  9:43 UTC (permalink / raw)
  To: Stefan Liebler; +Cc: libc-alpha

On Wed, 2017-01-11 at 17:06 +0100, Stefan Liebler wrote:
> On 12/22/2016 10:34 AM, Torvald Riegel wrote:
> > On Tue, 2016-12-20 at 15:28 +0100, Stefan Liebler wrote:
> >> On 12/16/2016 07:14 PM, Torvald Riegel wrote:
> >>> On Tue, 2016-12-13 at 09:38 +0100, Stefan Liebler wrote:
> >>>> ping.
> >>>
> >>> Sorry, I've been busy with the robust mutex bugs.
> >>>
> >>> This patch is OK.  Thanks.
> >>>
> >> Thanks. Committed.
> >
> > I've reviewed and ack'ed patch 1 of 4, but it looks like you committed
> > patches 2-4 as well.  Did anyone review these?
> >
> Sorry. That was my fault.
> Shall I revert the other patches and continue after the release?
> 

Let's do the normal review process first.  I've reviewed the other 3 by
now; maybe we can just fix them up where necessary.  Sounds good?

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 2/4] S390: Use own tbegin macro instead of __builtin_tbegin.
  2017-01-10 16:34   ` Torvald Riegel
@ 2017-01-12 15:45     ` Florian Weimer
  2017-01-13 11:28       ` Torvald Riegel
  2017-01-17 15:28       ` Stefan Liebler
  2017-01-17 15:28     ` Stefan Liebler
  1 sibling, 2 replies; 35+ messages in thread
From: Florian Weimer @ 2017-01-12 15:45 UTC (permalink / raw)
  To: Torvald Riegel, Stefan Liebler; +Cc: libc-alpha

On 01/10/2017 05:34 PM, Torvald Riegel wrote:

> (2) This introduces a facility to probe memory for being accessible or
> not, considering that you say it masks segfaults.  It seems that this
> probing may not be visible to the same extent as possible if a signal
> handler were installed.  Is this relevant from a security perspective?

If the fallback implementation has essentially the same behavior, I 
don't think there is a transaction-specific security problem.

One thing to check is if anything in the transaction memory code writes 
unprotected function pointers/code addresses to memory.  I'm not 
familiar with z Systems machine code, so I don't know if that's the case.

For example, it would be problematic to store the address of the 
transaction abort handler in a TLS variable.

>> +			   /* Begin transaction: save all gprs, allow	\
>> +			      ar modification and fp operations.  Some	\
>> +			      program-interruptions (e.g. a null	\
>> +			      pointer access) are filtered and the	\
>> +			      trancsaction will abort.  In this case	\

Typo: â€œtransactionâ€

Thanks,
Florian

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 2/4] S390: Use own tbegin macro instead of __builtin_tbegin.
  2017-01-12 15:45     ` Florian Weimer
@ 2017-01-13 11:28       ` Torvald Riegel
  2017-01-13 14:50         ` Florian Weimer
  2017-01-17 15:28       ` Stefan Liebler
  1 sibling, 1 reply; 35+ messages in thread
From: Torvald Riegel @ 2017-01-13 11:28 UTC (permalink / raw)
  To: Florian Weimer; +Cc: Stefan Liebler, libc-alpha

On Thu, 2017-01-12 at 16:45 +0100, Florian Weimer wrote:
> On 01/10/2017 05:34 PM, Torvald Riegel wrote:
> 
> > (2) This introduces a facility to probe memory for being accessible or
> > not, considering that you say it masks segfaults.  It seems that this
> > probing may not be visible to the same extent as possible if a signal
> > handler were installed.  Is this relevant from a security perspective?
> 
> If the fallback implementation has essentially the same behavior, I 
> don't think there is a transaction-specific security problem.

We don't know what the fallback implementation in the user code does.
It can detect whether it is running in a HW transaction and run
different code depending on that.

There are different approaches to what HTMs do when transactions run
into segfaults and the like.  IIRC, Intel masks them all, so
transactions aborts before the segfault "materializes".  AMD's old ASF
proposal did not mask segfaults (or normal faults).

I'm not quite sure whether the amount of probing that you could do with
this patch on s390 would be substantially different than what would be
possible on Intel's RTM (or our lock elision implementation for TSX).

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 2/4] S390: Use own tbegin macro instead of __builtin_tbegin.
  2017-01-13 11:28       ` Torvald Riegel
@ 2017-01-13 14:50         ` Florian Weimer
  2017-01-13 15:32           ` Torvald Riegel
  0 siblings, 1 reply; 35+ messages in thread
From: Florian Weimer @ 2017-01-13 14:50 UTC (permalink / raw)
  To: Torvald Riegel; +Cc: Stefan Liebler, libc-alpha

On 01/13/2017 12:28 PM, Torvald Riegel wrote:
> On Thu, 2017-01-12 at 16:45 +0100, Florian Weimer wrote:
>> On 01/10/2017 05:34 PM, Torvald Riegel wrote:
>>
>>> (2) This introduces a facility to probe memory for being accessible or
>>> not, considering that you say it masks segfaults.  It seems that this
>>> probing may not be visible to the same extent as possible if a signal
>>> handler were installed.  Is this relevant from a security perspective?
>>
>> If the fallback implementation has essentially the same behavior, I
>> don't think there is a transaction-specific security problem.
>
> We don't know what the fallback implementation in the user code does.
> It can detect whether it is running in a HW transaction and run
> different code depending on that.

Is this really supported?  I assumed that if you acquire a lock (which 
might use elision), hardware transactions are off limits because glibc 
might require their use internally.

Thanks,
Florian

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 2/4] S390: Use own tbegin macro instead of __builtin_tbegin.
  2017-01-13 14:50         ` Florian Weimer
@ 2017-01-13 15:32           ` Torvald Riegel
  2017-01-13 15:34             ` Florian Weimer
  2017-01-13 16:22             ` Stefan Liebler
  0 siblings, 2 replies; 35+ messages in thread
From: Torvald Riegel @ 2017-01-13 15:32 UTC (permalink / raw)
  To: Florian Weimer; +Cc: Stefan Liebler, libc-alpha

On Fri, 2017-01-13 at 15:50 +0100, Florian Weimer wrote:
> On 01/13/2017 12:28 PM, Torvald Riegel wrote:
> > On Thu, 2017-01-12 at 16:45 +0100, Florian Weimer wrote:
> >> On 01/10/2017 05:34 PM, Torvald Riegel wrote:
> >>
> >>> (2) This introduces a facility to probe memory for being accessible or
> >>> not, considering that you say it masks segfaults.  It seems that this
> >>> probing may not be visible to the same extent as possible if a signal
> >>> handler were installed.  Is this relevant from a security perspective?
> >>
> >> If the fallback implementation has essentially the same behavior, I
> >> don't think there is a transaction-specific security problem.
> >
> > We don't know what the fallback implementation in the user code does.
> > It can detect whether it is running in a HW transaction and run
> > different code depending on that.
> 
> Is this really supported?  I assumed that if you acquire a lock (which 
> might use elision), hardware transactions are off limits because glibc 
> might require their use internally.

You can check whether you run in a transaction.  That's no guarantee
that that transaction has been started by the elision code, but you can
check that nonetheless.

HW transactions must be usable regardless of whether something else uses
elision or not.  One of course shouldn't just commit transactions that
one did not start, though.

Anyway, I'm not sure whether this gives user code more means to exploit
or probe something, or not.  Maybe I was just too cautious here.


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 2/4] S390: Use own tbegin macro instead of __builtin_tbegin.
  2017-01-13 15:32           ` Torvald Riegel
@ 2017-01-13 15:34             ` Florian Weimer
  2017-01-13 16:22             ` Stefan Liebler
  1 sibling, 0 replies; 35+ messages in thread
From: Florian Weimer @ 2017-01-13 15:34 UTC (permalink / raw)
  To: Torvald Riegel; +Cc: Stefan Liebler, libc-alpha

On 01/13/2017 04:32 PM, Torvald Riegel wrote:

> Anyway, I'm not sure whether this gives user code more means to exploit
> or probe something, or not.  Maybe I was just too cautious here.

Well, on a certain level, it's no worse than fork. :)

I'm not too worried about it (even though fork is of course abused in 
this way, it used to be the main reason why Windows and Linux exploit 
development were so different).

Florian

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 2/4] S390: Use own tbegin macro instead of __builtin_tbegin.
  2017-01-13 15:32           ` Torvald Riegel
  2017-01-13 15:34             ` Florian Weimer
@ 2017-01-13 16:22             ` Stefan Liebler
  2017-01-13 22:22               ` Torvald Riegel
  1 sibling, 1 reply; 35+ messages in thread
From: Stefan Liebler @ 2017-01-13 16:22 UTC (permalink / raw)
  To: libc-alpha

On 01/13/2017 04:32 PM, Torvald Riegel wrote:
> On Fri, 2017-01-13 at 15:50 +0100, Florian Weimer wrote:
>> On 01/13/2017 12:28 PM, Torvald Riegel wrote:
>>> On Thu, 2017-01-12 at 16:45 +0100, Florian Weimer wrote:
>>>> On 01/10/2017 05:34 PM, Torvald Riegel wrote:
>>>>
>>>>> (2) This introduces a facility to probe memory for being accessible or
>>>>> not, considering that you say it masks segfaults.  It seems that this
>>>>> probing may not be visible to the same extent as possible if a signal
>>>>> handler were installed.  Is this relevant from a security perspective?
>>>>
>>>> If the fallback implementation has essentially the same behavior, I
>>>> don't think there is a transaction-specific security problem.
>>>
>>> We don't know what the fallback implementation in the user code does.
>>> It can detect whether it is running in a HW transaction and run
>>> different code depending on that.
Yes, the user can detect if it is running within a transaction (e.g. 
with the __builtin_tx_nesting_depth like used in elision_trylock).
With non transactional stores, he could also store something even if the 
transaction aborts.
>>
>> Is this really supported?  I assumed that if you acquire a lock (which
>> might use elision), hardware transactions are off limits because glibc
>> might require their use internally.
If someone starts a transaction within a transaction, then the nesting 
depth is incremented. If the outermost transaction is ended, the changes 
are committed.
>
> You can check whether you run in a transaction.  That's no guarantee
> that that transaction has been started by the elision code, but you can
> check that nonetheless.
>
> HW transactions must be usable regardless of whether something else uses
> elision or not.  One of course shouldn't just commit transactions that
> one did not start, though.
You are right, but this is possible.
>
> Anyway, I'm not sure whether this gives user code more means to exploit
> or probe something, or not.  Maybe I was just too cautious here.
>
>
That's correct, you can probe something.
But if someone wants to do that, he can also start a transaction 
directly without pthread_mutex_lock.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 2/4] S390: Use own tbegin macro instead of __builtin_tbegin.
  2017-01-13 16:22             ` Stefan Liebler
@ 2017-01-13 22:22               ` Torvald Riegel
  0 siblings, 0 replies; 35+ messages in thread
From: Torvald Riegel @ 2017-01-13 22:22 UTC (permalink / raw)
  To: Stefan Liebler; +Cc: libc-alpha

On Fri, 2017-01-13 at 17:22 +0100, Stefan Liebler wrote:
> On 01/13/2017 04:32 PM, Torvald Riegel wrote:
> > Anyway, I'm not sure whether this gives user code more means to exploit
> > or probe something, or not.  Maybe I was just too cautious here.
> >
> >
> That's correct, you can probe something.
> But if someone wants to do that, he can also start a transaction 
> directly without pthread_mutex_lock.

True, but it might be easier for an exploit to set up a call to existing
code than to either build binary code that starts transactions or find
suitable binary code elsewhere.



^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 2/4] S390: Use own tbegin macro instead of __builtin_tbegin.
  2017-01-10 16:34   ` Torvald Riegel
  2017-01-12 15:45     ` Florian Weimer
@ 2017-01-17 15:28     ` Stefan Liebler
  2017-01-17 17:44       ` Torvald Riegel
  1 sibling, 1 reply; 35+ messages in thread
From: Stefan Liebler @ 2017-01-17 15:28 UTC (permalink / raw)
  To: libc-alpha

[-- Attachment #1: Type: text/plain, Size: 1675 bytes --]

On 01/10/2017 05:34 PM, Torvald Riegel wrote:
  > I'm not sure that this is always a good thing to do.  It is nicer for
  > programmers if they can get an instruction pointer that points to where
  > the faulty access happened.  However:
  >
  > (1) This can mask failures present in an application, because the
  > fallback path is not guaranteed to operate on the same data.  It may
  > never run into this problem.  Can inconsistencies arise due to that?
  > For example, is a masked segfault still visible through performance
  > counters or something like that?
No. If an interruption is filtered, those informations are not stored.
  >
  > (2) This introduces a facility to probe memory for being accessible or
  > not, considering that you say it masks segfaults.  It seems that this
  > probing may not be visible to the same extent as possible if a signal
  > handler were installed.  Is this relevant from a security perspective?
  > It may not be given that perhaps the user can do that anyway through
  > direct usage of transactions, or perhaps because the user would have to
  > be able to check whether the program is currently executing in a
  > transaction or not.
  > It might be good to at least mention this in a comment in the code.
I've documented it in the comment.

> On Tue, 2016-12-06 at 14:51 +0100, Stefan Liebler wrote:
>> +/* These builtins are correct.  Use them.  */
>
> Is "correct" the right word here?  I suppose they are always correct,
> it's just that you want something different for glibc internally.
>
I'll correct it.

I've attached the diff here and will later make one patch with changelog 
for this and the other two patches.





[-- Attachment #2: 20170117_htm_2.patch --]
[-- Type: text/x-patch, Size: 1169 bytes --]

diff --git a/sysdeps/unix/sysv/linux/s390/htm.h b/sysdeps/unix/sysv/linux/s390/htm.h
index 32d5a88..3b0ab7f 100644
--- a/sysdeps/unix/sysv/linux/s390/htm.h
+++ b/sysdeps/unix/sysv/linux/s390/htm.h
@@ -123,7 +123,12 @@
 			      the normal lock path will execute it	\
 			      again and result in a core dump wich does	\
 			      now show at tbegin but the real executed	\
-			      instruction.  */				\
+			      instruction.				\
+			      However it is not guaranteed that this	\
+			      retry operate on the same data and thus	\
+			      may not end in an program-interruption.	\
+			      Note: This could also be used to probe	\
+			      memory for being accessible!  */		\
 			   "2: tbegin 0, 0xFF0E\n\t"			\
 			   /* Branch away in abort case (this is the	\
 			      prefered sequence.  See PoP in chapter 5	\
@@ -152,7 +157,8 @@
      __ret;								\
      })
 
-/* These builtins are correct.  Use them.  */
+/* These builtins are usable in context of glibc lock elision code without any
+   changes.  Use them.  */
 #define __libc_tend()							\
   ({ __asm__ __volatile__ (".machine push\n\t"				\
 			   ".machinemode \"zarch_nohighgprs\"\n\t"	\


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 2/4] S390: Use own tbegin macro instead of __builtin_tbegin.
  2017-01-12 15:45     ` Florian Weimer
  2017-01-13 11:28       ` Torvald Riegel
@ 2017-01-17 15:28       ` Stefan Liebler
  1 sibling, 0 replies; 35+ messages in thread
From: Stefan Liebler @ 2017-01-17 15:28 UTC (permalink / raw)
  To: libc-alpha

[-- Attachment #1: Type: text/plain, Size: 1827 bytes --]

On 01/12/2017 04:45 PM, Florian Weimer wrote:
> On 01/10/2017 05:34 PM, Torvald Riegel wrote:
>
>> (2) This introduces a facility to probe memory for being accessible or
>> not, considering that you say it masks segfaults.  It seems that this
>> probing may not be visible to the same extent as possible if a signal
>> handler were installed.  Is this relevant from a security perspective?
>
> If the fallback implementation has essentially the same behavior, I
> don't think there is a transaction-specific security problem.
>
> One thing to check is if anything in the transaction memory code writes
> unprotected function pointers/code addresses to memory.  I'm not
> familiar with z Systems machine code, so I don't know if that's the case.
>
> For example, it would be problematic to store the address of the
> transaction abort handler in a TLS variable.

The first tbegin instruction starts transactional execution mode.
Then the Transaction-Abort PSW is set to the instruction after the
tbegin instruction. If this transaction or multiple nested transactions 
are aborted, the instruction in Transaction-Abort PSW (after the 
outermost tbegin) is executed and the condition code set to 1-3.
The code behind tbegin has to determine what to do next.
There is no instruction to extract Transaction-Abort PSW.
>
>>> +               /* Begin transaction: save all gprs, allow    \
>>> +                  ar modification and fp operations.  Some    \
>>> +                  program-interruptions (e.g. a null    \
>>> +                  pointer access) are filtered and the    \
>>> +                  trancsaction will abort.  In this case    \
>
> Typo: â€œtransactionâ€
okay.
>
> Thanks,
> Florian
>

I've attached the diff here and will later make one patch with changelog 
for this and the other two patches.





[-- Attachment #2: 20170116_htm_1.patch --]
[-- Type: text/x-patch, Size: 626 bytes --]

diff --git a/sysdeps/unix/sysv/linux/s390/htm.h b/sysdeps/unix/sysv/linux/s390/htm.h
index 32d5a88..af7144f 100644
--- a/sysdeps/unix/sysv/linux/s390/htm.h
+++ b/sysdeps/unix/sysv/linux/s390/htm.h
@@ -119,7 +119,7 @@
 			      ar modification and fp operations.  Some	\
 			      program-interruptions (e.g. a null	\
 			      pointer access) are filtered and the	\
-			      trancsaction will abort.  In this case	\
+			      transaction will abort.  In this case	\
 			      the normal lock path will execute it	\
 			      again and result in a core dump wich does	\
 			      now show at tbegin but the real executed	\




^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 3/4] S390: Use new __libc_tbegin_retry macro in elision-lock.c.
  2017-01-10 16:53   ` Torvald Riegel
@ 2017-01-17 15:28     ` Stefan Liebler
  2017-01-17 17:45       ` Torvald Riegel
  0 siblings, 1 reply; 35+ messages in thread
From: Stefan Liebler @ 2017-01-17 15:28 UTC (permalink / raw)
  To: libc-alpha

[-- Attachment #1: Type: text/plain, Size: 4803 bytes --]

On 01/10/2017 05:53 PM, Torvald Riegel wrote:
> On Tue, 2016-12-06 at 14:51 +0100, Stefan Liebler wrote:
>> This patch implements __libc_tbegin_retry macro which is equivalent to
>> gcc builtin __builtin_tbegin_retry, except the changes which were applied
>> to __libc_tbegin in the previous patch.
>>
>> If tbegin aborts with _HTM_TBEGIN_TRANSIENT.  Then this macros restores
>> the fpc, fprs and automatically retries up to retry_cnt tbegins.
>> Further saving of the state is omitted as it is already saved in the
>> first round.  Before retrying a further transaction, the
>> transaction-abort-assist instruction is used to support the cpu.
>
> This looks okay to me on the surface of it, but I don't know anything
> about s390 asm.
>
>> Use __libc_tbegin_retry macro.
>> ---
>>  sysdeps/unix/sysv/linux/s390/elision-lock.c | 50 ++++++++++++++---------------
>>  sysdeps/unix/sysv/linux/s390/htm.h          | 36 +++++++++++++++++++--
>>  2 files changed, 58 insertions(+), 28 deletions(-)
>>
>> diff --git a/sysdeps/unix/sysv/linux/s390/elision-lock.c b/sysdeps/unix/sysv/linux/s390/elision-lock.c
>> index 48cc3db..3dd7fbc 100644
>> --- a/sysdeps/unix/sysv/linux/s390/elision-lock.c
>> +++ b/sysdeps/unix/sysv/linux/s390/elision-lock.c
>> @@ -60,17 +60,16 @@ __lll_lock_elision (int *futex, short *adapt_count, EXTRAARG int private)
>>        goto use_lock;
>>      }
>>
>> -  int try_tbegin;
>> -  for (try_tbegin = aconf.try_tbegin;
>> -       try_tbegin > 0;
>> -       try_tbegin--)
>> +  if (aconf.try_tbegin > 0)
>>      {
>> -      int status;
>> -      if (__builtin_expect
>> -	  ((status = __libc_tbegin ((void *) 0)) == _HTM_TBEGIN_STARTED, 1))
>> +      int status = __libc_tbegin_retry ((void *) 0, aconf.try_tbegin - 1);
>
> Maybe add a comment that the macro that reminds the reader that the
> macro expects a retry count, not how often a transaction should be
> tried.
>
okay
>> +      if (__builtin_expect (status == _HTM_TBEGIN_STARTED,
>> +			    _HTM_TBEGIN_STARTED))
>>  	{
>> -	  if (*futex == 0)
>> +	  if (__builtin_expect (*futex == 0, 1))
>> +	    /* Lock was free.  Return to user code in a transaction.  */
>>  	    return 0;
>> +
>>  	  /* Lock was busy.  Fall back to normal locking.  */
>>  	  if (__builtin_expect (__libc_tx_nesting_depth (), 1))
>>  	    {
>> @@ -81,7 +80,6 @@ __lll_lock_elision (int *futex, short *adapt_count, EXTRAARG int private)
>>  		 See above for why relaxed MO is sufficient.  */
>>  	      if (aconf.skip_lock_busy > 0)
>>  		atomic_store_relaxed (adapt_count, aconf.skip_lock_busy);
>> -	      goto use_lock;
>>  	    }
>>  	  else /* nesting depth is > 1 */
>>  	    {
>> @@ -99,28 +97,28 @@ __lll_lock_elision (int *futex, short *adapt_count, EXTRAARG int private)
>>  	      __libc_tabort (_HTM_FIRST_USER_ABORT_CODE | 1);
>>  	    }
>>  	}
>> +      else if (status != _HTM_TBEGIN_TRANSIENT)
>> +	{
>> +	  /* A persistent abort (cc 1 or 3) indicates that a retry is
>> +	     probably futile.  Use the normal locking now and for the
>> +	     next couple of calls.
>> +	     Be careful to avoid writing to the lock.  See above for why
>> +	     relaxed MO is sufficient.  */
>> +	  if (aconf.skip_lock_internal_abort > 0)
>> +	    atomic_store_relaxed (adapt_count,
>> +				  aconf.skip_lock_internal_abort);
>> +	}
>>        else
>>  	{
>> -	  if (status != _HTM_TBEGIN_TRANSIENT)
>> -	    {
>> -	      /* A persistent abort (cc 1 or 3) indicates that a retry is
>> -		 probably futile.  Use the normal locking now and for the
>> -		 next couple of calls.
>> -		 Be careful to avoid writing to the lock.  See above for why
>> -		 relaxed MO is sufficient.  */
>> -	      if (aconf.skip_lock_internal_abort > 0)
>> -		atomic_store_relaxed (adapt_count,
>> -				      aconf.skip_lock_internal_abort);
>> -	      goto use_lock;
>> -	    }
>> +	  /* Same logic as above, but for for a number of temporary failures in
>> +	     a row.  */
>
> for for
>
This comment is rewritten in patch 4/4. Thus it is not changed in 
attached diff.
>> +	  if (aconf.skip_lock_out_of_tbegin_retries > 0)
>> +	    atomic_store_relaxed (adapt_count,
>> +				  aconf.skip_lock_out_of_tbegin_retries);
>>  	}
>>      }
>>
>> -  /* Same logic as above, but for for a number of temporary failures in a
>> -     row.  See above for why relaxed MO is sufficient.  */
>> -  if (aconf.skip_lock_out_of_tbegin_retries > 0 && aconf.try_tbegin > 0)
>> -    atomic_store_relaxed (adapt_count, aconf.skip_lock_out_of_tbegin_retries);
>> -
>>    use_lock:
>> +  /* Use normal locking as fallback path if transaction does not succeed.  */
>
> the transaction
>
okay. Changed in elision-trylock.c, too.
>>    return LLL_LOCK ((*futex), private);
>>  }
>
>

I've attached the diff here and will later make one patch with changelog 
for this and the other two patches.



[-- Attachment #2: 20170117_3.patch --]
[-- Type: text/x-patch, Size: 1711 bytes --]

diff --git a/sysdeps/unix/sysv/linux/s390/elision-lock.c b/sysdeps/unix/sysv/linux/s390/elision-lock.c
index a7c0fcc..faa998e 100644
--- a/sysdeps/unix/sysv/linux/s390/elision-lock.c
+++ b/sysdeps/unix/sysv/linux/s390/elision-lock.c
@@ -57,6 +57,9 @@ __lll_lock_elision (int *futex, short *adapt_count, EXTRAARG int private)
     if (atomic_load_relaxed (futex) == 0
 	&& atomic_load_relaxed (adapt_count) <= 0 && aconf.try_tbegin > 0)
     {
+      /* Start a transaction and retry it automatically if it aborts with
+	 _HTM_TBEGIN_TRANSIENT.  This macro calls tbegin at most retry_cnt
+	 + 1 times.  The second argument is considered as retry_cnt.  */
       int status = __libc_tbegin_retry ((void *) 0, aconf.try_tbegin - 1);
       if (__builtin_expect (status == _HTM_TBEGIN_STARTED,
 			    _HTM_TBEGIN_STARTED))
@@ -118,6 +121,7 @@ __lll_lock_elision (int *futex, short *adapt_count, EXTRAARG int private)
 	}
     }
 
-  /* Use normal locking as fallback path if transaction does not succeed.  */
+  /* Use normal locking as fallback path if the transaction does not
+     succeed.  */
   return LLL_LOCK ((*futex), private);
 }
diff --git a/sysdeps/unix/sysv/linux/s390/elision-trylock.c b/sysdeps/unix/sysv/linux/s390/elision-trylock.c
index 3c1d009..eec172a 100644
--- a/sysdeps/unix/sysv/linux/s390/elision-trylock.c
+++ b/sysdeps/unix/sysv/linux/s390/elision-trylock.c
@@ -93,6 +93,7 @@ __lll_trylock_elision (int *futex, short *adapt_count)
       /* Could do some retries here.  */
     }
 
-  /* Use normal locking as fallback path if transaction does not succeed.  */
+  /* Use normal locking as fallback path if the transaction does not
+     succeed.  */
   return lll_trylock (*futex);
 }


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 4/4] S390: Optimize lock-elision by decrementing adapt_count at unlock.
  2017-01-11 10:53   ` Torvald Riegel
@ 2017-01-17 15:28     ` Stefan Liebler
  2017-01-17 18:53       ` Torvald Riegel
  0 siblings, 1 reply; 35+ messages in thread
From: Stefan Liebler @ 2017-01-17 15:28 UTC (permalink / raw)
  To: libc-alpha

[-- Attachment #1: Type: text/plain, Size: 17885 bytes --]

On 01/11/2017 11:53 AM, Torvald Riegel wrote:
> On Tue, 2016-12-06 at 14:51 +0100, Stefan Liebler wrote:
>> This patch decrements the adapt_count while unlocking the futex
>> instead of before aquiring the futex as it is done on power, too.
>> Furthermore a transaction is only started if the futex is currently free.
>> This check is done after starting the transaction, too.
>> If the futex is not free and the transaction nesting depth is one,
>> we can simply end the started transaction instead of aborting it.
>> The implementation of this check was faulty as it always ended the
>> started transaction.  By using the fallback path, the the outermost
>> transaction was aborted.  Now the outermost transaction is aborted
>> directly.
>>
>> This patch also adds some commentary and aligns the code in
>> elision-trylock.c to the code in elision-lock.c as possible.
>
> I don't think this is quite ready yet.  See below for details.
>
> I'm not too concerned about this fact, given that it's just in
> s390-specific code.  But generally, I'd prefer if arch-specific code
> aims for the same quality and level of consensus about it as what is our
> aim for generic code.
>
>> ChangeLog:
>>
>> 	* sysdeps/unix/sysv/linux/s390/lowlevellock.h
>> 	(__lll_unlock_elision, lll_unlock_elision): Add adapt_count argument.
>> 	* sysdeps/unix/sysv/linux/s390/elision-lock.c:
>> 	(__lll_lock_elision): Decrement adapt_count while unlocking
>> 	instead of before locking.
>> 	* sysdeps/unix/sysv/linux/s390/elision-trylock.c
>> 	(__lll_trylock_elision): Likewise.
>> 	* sysdeps/unix/sysv/linux/s390/elision-unlock.c:
>> 	(__lll_unlock_elision): Likewise.
>> ---
>>  sysdeps/unix/sysv/linux/s390/elision-lock.c    | 37 ++++++++-------
>>  sysdeps/unix/sysv/linux/s390/elision-trylock.c | 62 ++++++++++++++------------
>>  sysdeps/unix/sysv/linux/s390/elision-unlock.c  | 29 ++++++++++--
>>  sysdeps/unix/sysv/linux/s390/lowlevellock.h    |  4 +-
>>  4 files changed, 78 insertions(+), 54 deletions(-)
>>
>> diff --git a/sysdeps/unix/sysv/linux/s390/elision-lock.c b/sysdeps/unix/sysv/linux/s390/elision-lock.c
>> index 3dd7fbc..4a7d546 100644
>> --- a/sysdeps/unix/sysv/linux/s390/elision-lock.c
>> +++ b/sysdeps/unix/sysv/linux/s390/elision-lock.c
>> @@ -50,31 +50,30 @@ __lll_lock_elision (int *futex, short *adapt_count, EXTRAARG int private)
>>       critical section uses lock elision) and outside of transactions.  Thus,
>>       we need to use atomic accesses to avoid data races.  However, the
>>       value of adapt_count is just a hint, so relaxed MO accesses are
>> -     sufficient.  */
>> -  if (atomic_load_relaxed (adapt_count) > 0)
>> -    {
>> -      /* Lost updates are possible, but harmless.  Due to races this might lead
>> -	 to *adapt_count becoming less than zero.  */
>> -      atomic_store_relaxed (adapt_count,
>> -			    atomic_load_relaxed (adapt_count) - 1);
>> -      goto use_lock;
>> -    }
>> -
>> -  if (aconf.try_tbegin > 0)
>> +     sufficient.
>> +     Do not begin a transaction if another cpu has locked the
>> +     futex with normal locking.  If adapt_count is zero, it remains and the
>> +     next pthread_mutex_lock call will try to start a transaction again.  */
>
> This seems to make an assumption about performance that should be
> explained in the comment.  IIRC, x86 LE does not make this assumption,
> so it's not generally true.  I suppose s390 aborts are really expensive,
> and you don't expect that a lock is in the acquired state often enough
> so that aborts are overall more costly than the overhead of the
> additional load and branch?
>
Yes, aborting a transaction is expensive.
But you are right, there is an additional load and branch and this 
thread will anyway wait in LLL_LOCK for another thread releasing the futex.
See example below.

>> +    if (atomic_load_relaxed (futex) == 0
>> +	&& atomic_load_relaxed (adapt_count) <= 0 && aconf.try_tbegin > 0)
>>      {
>>        int status = __libc_tbegin_retry ((void *) 0, aconf.try_tbegin - 1);
>>        if (__builtin_expect (status == _HTM_TBEGIN_STARTED,
>>  			    _HTM_TBEGIN_STARTED))
>>  	{
>> -	  if (__builtin_expect (*futex == 0, 1))
>> +	  /* Check the futex to make sure nobody has touched it in the
>> +	     mean time.  This forces the futex into the cache and makes
>> +	     sure the transaction aborts if some other cpu uses the
>> +	     lock (writes the futex).  */
>
> s/cpu/CPU/
I'll correct it in the whole file and in elision-trylock.c.
>
> I'd also say "if another thread acquires the lock concurrently" instead
> of the last part of that sentence.
>
>> +	  if (__builtin_expect (atomic_load_relaxed (futex) == 0, 1))
>>  	    /* Lock was free.  Return to user code in a transaction.  */
>>  	    return 0;
>>
>>  	  /* Lock was busy.  Fall back to normal locking.  */
>> -	  if (__builtin_expect (__libc_tx_nesting_depth (), 1))
>> +	  if (__builtin_expect (__libc_tx_nesting_depth () <= 1, 1))
>
> Use __glibc_likely?
okay.
>
>>  	    {
>>  	      /* In a non-nested transaction there is no need to abort,
>> -		 which is expensive.  */
>> +		 which is expensive.  Simply end the started transaction.  */
>>  	      __libc_tend ();
>>  	      /* Don't try to use transactions for the next couple of times.
>>  		 See above for why relaxed MO is sufficient.  */
>> @@ -92,9 +91,9 @@ __lll_lock_elision (int *futex, short *adapt_count, EXTRAARG int private)
>>  		 is zero.
>>  		 The adapt_count of this inner mutex is not changed,
>>  		 because using the default lock with the inner mutex
>> -		 would abort the outer transaction.
>> -	      */
>> +		 would abort the outer transaction.  */
>>  	      __libc_tabort (_HTM_FIRST_USER_ABORT_CODE | 1);
>> +	      __builtin_unreachable ();
>>  	    }
>>  	}
>>        else if (status != _HTM_TBEGIN_TRANSIENT)
>> @@ -110,15 +109,15 @@ __lll_lock_elision (int *futex, short *adapt_count, EXTRAARG int private)
>>  	}
>>        else
>>  	{
>> -	  /* Same logic as above, but for for a number of temporary failures in
>> -	     a row.  */
>> +	  /* The transaction failed for some retries with
>> +	     _HTM_TBEGIN_TRANSIENT.  Use the normal locking now and for the
>> +	     next couple of calls.  */
>>  	  if (aconf.skip_lock_out_of_tbegin_retries > 0)
>>  	    atomic_store_relaxed (adapt_count,
>>  				  aconf.skip_lock_out_of_tbegin_retries);
>>  	}
>>      }
>>
>> -  use_lock:
>>    /* Use normal locking as fallback path if transaction does not succeed.  */
>>    return LLL_LOCK ((*futex), private);
>>  }
>> diff --git a/sysdeps/unix/sysv/linux/s390/elision-trylock.c b/sysdeps/unix/sysv/linux/s390/elision-trylock.c
>> index e21fc26..dee66d4 100644
>> --- a/sysdeps/unix/sysv/linux/s390/elision-trylock.c
>> +++ b/sysdeps/unix/sysv/linux/s390/elision-trylock.c
>> @@ -43,23 +43,36 @@ __lll_trylock_elision (int *futex, short *adapt_count)
>>  	 until their try_tbegin is zero.
>>        */
>>        __libc_tabort (_HTM_FIRST_USER_ABORT_CODE | 1);
>> +      __builtin_unreachable ();
>>      }
>>
>> -  /* Only try a transaction if it's worth it.  See __lll_lock_elision for
>> -     why we need atomic accesses.  Relaxed MO is sufficient because this is
>> -     just a hint.  */
>> -  if (atomic_load_relaxed (adapt_count) <= 0)
>> +  /* adapt_count can be accessed concurrently; these accesses can be both
>> +     inside of transactions (if critical sections are nested and the outer
>> +     critical section uses lock elision) and outside of transactions.  Thus,
>> +     we need to use atomic accesses to avoid data races.  However, the
>> +     value of adapt_count is just a hint, so relaxed MO accesses are
>> +     sufficient.
>> +     Do not begin a transaction if another cpu has locked the
>> +     futex with normal locking.  If adapt_count is zero, it remains and the
>> +     next pthread_mutex_lock call will try to start a transaction again.  */
>> +    if (atomic_load_relaxed (futex) == 0
>> +	&& atomic_load_relaxed (adapt_count) <= 0 && aconf.try_tbegin > 0)
>>      {
>> -      int status;
>> -
>> -      if (__builtin_expect
>> -	  ((status = __libc_tbegin ((void *) 0)) == _HTM_TBEGIN_STARTED, 1))
>> +      int status = __libc_tbegin ((void *) 0);
>> +      if (__builtin_expect (status  == _HTM_TBEGIN_STARTED,
>> +			    _HTM_TBEGIN_STARTED))
>>  	{
>> -	  if (*futex == 0)
>> +	  /* Check the futex to make sure nobody has touched it in the
>> +	     mean time.  This forces the futex into the cache and makes
>> +	     sure the transaction aborts if some other cpu uses the
>> +	     lock (writes the futex).  */
>> +	  if (__builtin_expect (atomic_load_relaxed (futex) == 0, 1))
>
> __glibc_likely
okay.
>
>> +	    /* Lock was free.  Return to user code in a transaction.  */
>>  	    return 0;
>> -	  /* Lock was busy.  Fall back to normal locking.  */
>> -	  /* Since we are in a non-nested transaction there is no need to abort,
>> -	     which is expensive.  */
>> +
>> +	  /* Lock was busy.  Fall back to normal locking.  Since we are in
>> +	     a non-nested transaction there is no need to abort, which is
>> +	     expensive.  Simply end the started transaction.  */
>>  	  __libc_tend ();
>>  	  /* Note: Changing the adapt_count here might abort a transaction on a
>>  	     different cpu, but that could happen anyway when the futex is
>> @@ -68,27 +81,18 @@ __lll_trylock_elision (int *futex, short *adapt_count)
>>  	  if (aconf.skip_lock_busy > 0)
>>  	    atomic_store_relaxed (adapt_count, aconf.skip_lock_busy);
>>  	}
>> -      else
>> +      else if (status != _HTM_TBEGIN_TRANSIENT)
>>  	{
>> -	  if (status != _HTM_TBEGIN_TRANSIENT)
>> -	    {
>> -	      /* A persistent abort (cc 1 or 3) indicates that a retry is
>> -		 probably futile.  Use the normal locking now and for the
>> -		 next couple of calls.
>> -		 Be careful to avoid writing to the lock.  */
>> -	      if (aconf.skip_trylock_internal_abort > 0)
>> -		*adapt_count = aconf.skip_trylock_internal_abort;
>> -	    }
>> +	  /* A persistent abort (cc 1 or 3) indicates that a retry is
>> +	     probably futile.  Use the normal locking now and for the
>> +	     next couple of calls.
>> +	     Be careful to avoid writing to the lock.  */
>> +	  if (aconf.skip_trylock_internal_abort > 0)
>> +	    *adapt_count = aconf.skip_trylock_internal_abort;
>>  	}
>>        /* Could do some retries here.  */
>>      }
>> -  else
>> -    {
>> -      /* Lost updates are possible, but harmless.  Due to races this might lead
>> -	 to *adapt_count becoming less than zero.  */
>> -      atomic_store_relaxed (adapt_count,
>> -			    atomic_load_relaxed (adapt_count) - 1);
>> -    }
>>
>> +  /* Use normal locking as fallback path if transaction does not succeed.  */
>>    return lll_trylock (*futex);
>>  }
>> diff --git a/sysdeps/unix/sysv/linux/s390/elision-unlock.c b/sysdeps/unix/sysv/linux/s390/elision-unlock.c
>> index 0b1ade9..e68d970 100644
>> --- a/sysdeps/unix/sysv/linux/s390/elision-unlock.c
>> +++ b/sysdeps/unix/sysv/linux/s390/elision-unlock.c
>> @@ -21,16 +21,37 @@
>>  #include <htm.h>
>>
>>  int
>> -__lll_unlock_elision(int *futex, int private)
>> +__lll_unlock_elision(int *futex, short *adapt_count, int private)
>>  {
>>    /* If the lock is free, we elided the lock earlier.  This does not
>>       necessarily mean that we are in a transaction, because the user code may
>> -     have closed the transaction, but that is impossible to detect reliably.  */
>> -  if (*futex == 0)
>> +     have closed the transaction, but that is impossible to detect reliably.
>> +     Relaxed MO access to futex is sufficient as we only need a hint, if we
>
> This comment is incorrect because we don't just need just a hint here.
> The reason why relaxed MO is sufficient is because a correct program
> will only release a lock it has acquired; therefore, it must either
> changed the futex word's value to something !=0 or it must have used
> elision; these are actions by the same thread, so these actions are
> sequenced-before the relaxed load (and thus also happens-before the
> relaxed load).  Therefore, relaxed MO is sufficient.
>
okay.
>> +     started a transaction or acquired the futex in e.g. elision-lock.c.  */
>> +  if (atomic_load_relaxed (futex) == 0)
>>      {
>>        __libc_tend ();
>>      }
>>    else
>> -    lll_unlock ((*futex), private);
>> +    {
>> +      /* Update the adapt_count while unlocking before completing the critical
>> +	 section.  adapt_count is accessed concurrently outside of a
>> +	 transaction or an aquired lock e.g. in elision-lock.c so we need to use
>
> transaction or a critical section (e.g., in elision-lock.c); so, we need
> to use
>
okay

>> +	 atomic accesses.  However, the value of adapt_count is just a hint, so
>> +	 relaxed MO accesses are sufficient.
>> +	 If adapt_count would be decremented while locking, multiple
>> +	 CPUs trying to lock the locked mutex will decrement adapt_count to
>> +	 zero and another CPU will try to start a transaction, which will be
>> +	 immediately aborted as the mutex is locked.
>
> I don't think this is necessarily the case.  It is true that if more
> than one thread decrements, only one would immediately try to use
> elision (because only one decrements from 1 (ignoring lost updates)).

> However, if you decrement in the critical section, and lock acquisitions
> wait until the lock is free *before* loading adapt_count and choosing
> whether to use elision or not, then it shouldn't matter whether you
> decrement closer to the lock acquisition or closer to the release.
Waiting for a free lock is done by futex-syscall within LLL_LOCK (see 
__lll_lock_wait/__lll_lock_wait_private).
On wakeup the lock is immediately acquired if it is free.
Afterwards there is no loading of adapt_count and no decision whether to 
use elision or not.
Following your suggestion means, that we need a further implementation 
like in __lll_lock_wait/__lll_lock_wait_private in order to wait for a 
free lock and then load adapt_count and choose to elide or not!


Please have a look at the following example assuming:
adapt_count = 1;
There are two scenarios below: Imagine that only "Thread 3a" or "Thread 
3b" is used.

Decrement adapt_count while locking (without this patch):
-Thread 1 __lll_lock_elision:
decrements adapt_count to 0 and acquires the lock via LLL_LOCK.
-Thread 2 __lll_lock_elision:
starts a transaction and ends / aborts it immediately as lock is 
acquired. adapt_count is set to 3. LLL_LOCK waits until lock is released 
by Thread 1.
-Thread 1 __lll_unlock_elision:
releases lock.
-Thread 2 __lll_lock_elision:
wakes up and aquires lock via waiting LLL_LOCK.
-Thread 3a __lll_lock_elision:
decrements adapt_count to 2 and waits via LLL_LOCK until lock is 
released by Thread 2.
-Thread 2 __lll_unlock_elision:
releases lock.
-Thread 3b __lll_lock_elision:
decrements adapt_count to 2 and acquires lock via LLL_LOCK.

Decrement adapt_count while unlocking (with this patch):
-Thread 1 __lll_lock_elision:
acquires the lock via LLL_LOCK. adapt_count remains 1.
-Thread 2 __lll_lock_elision:
LLL_LOCK is used as futex is acquired by Thread 1 or adapt_count > 0. It 
waits until lock is released by Thread 1.
-Thread 1 __lll_unlock_elision:
decrements adapt_count to 0 and releases the lock.
-Thread 2 __lll_lock_elision:
wakes up and acquires lock via waiting LLL_LOCK.
-Thread 3a __lll_lock_elision:
LLL_LOCK is used as futex is acquired by Thread 2.
-Thread 2 __lll_unlock_elision:
releases lock. adapt_count remains 0.
-Thread 3b __lll_lock_elision:
starts a transaction.

If futex is not tested before starting a transaction,
the additional load and branch is not needed, Thread 3a will start and 
abort a transaction, set adapt_count to 3 and will wait via LLL_LOCK.
In case Thread 3b, a transaction will be started.

The attached diff removes the futex==0 test.
Later I will make one patch with changelog for this and the other two 
patches.
>
> I think this needs a more thorough analysis (including better
> documentation) and/or a microbenchmark.
>
>> +	 If adapt_count would be decremented while unlocking after completing
>> +	 the critical section, possible waiters will be waked up before
>> +	 decrementing the adapt_count.  Those waked up waiters could have
>> +	 destroyed and freed this mutex!  */
>
> Please fix this sentence.  Or you could just say that POSIX' mutex
> destruction requirements disallow accesses to the mutexes after it has
> been released and thus could have been acquired by another thread.
>
Okay.
>> +      short adapt_count_val = atomic_load_relaxed (adapt_count);
>> +      if (adapt_count_val > 0)
>> +	atomic_store_relaxed (adapt_count, adapt_count_val - 1);
>> +
>> +      lll_unlock ((*futex), private);
>> +    }
>>    return 0;
>>  }
>> diff --git a/sysdeps/unix/sysv/linux/s390/lowlevellock.h b/sysdeps/unix/sysv/linux/s390/lowlevellock.h
>> index 8d564ed..c60f4f7 100644
>> --- a/sysdeps/unix/sysv/linux/s390/lowlevellock.h
>> +++ b/sysdeps/unix/sysv/linux/s390/lowlevellock.h
>> @@ -50,7 +50,7 @@ extern int __lll_timedlock_elision
>>  extern int __lll_lock_elision (int *futex, short *adapt_count, int private)
>>    attribute_hidden;
>>
>> -extern int __lll_unlock_elision(int *futex, int private)
>> +extern int __lll_unlock_elision(int *futex, short *adapt_count, int private)
>>    attribute_hidden;
>>
>>  extern int __lll_trylock_elision(int *futex, short *adapt_count)
>> @@ -59,7 +59,7 @@ extern int __lll_trylock_elision(int *futex, short *adapt_count)
>>  # define lll_lock_elision(futex, adapt_count, private) \
>>    __lll_lock_elision (&(futex), &(adapt_count), private)
>>  #  define lll_unlock_elision(futex, adapt_count, private) \
>> -  __lll_unlock_elision (&(futex), private)
>> +  __lll_unlock_elision (&(futex), &(adapt_count), private)
>>  #  define lll_trylock_elision(futex, adapt_count) \
>>    __lll_trylock_elision(&(futex), &(adapt_count))
>>  #endif  /* ENABLE_LOCK_ELISION */
>
>
>



[-- Attachment #2: 20170117_4.patch --]
[-- Type: text/x-patch, Size: 7966 bytes --]

diff --git a/sysdeps/unix/sysv/linux/s390/elision-lock.c b/sysdeps/unix/sysv/linux/s390/elision-lock.c
index faa998e..0081537 100644
--- a/sysdeps/unix/sysv/linux/s390/elision-lock.c
+++ b/sysdeps/unix/sysv/linux/s390/elision-lock.c
@@ -50,30 +50,28 @@ __lll_lock_elision (int *futex, short *adapt_count, EXTRAARG int private)
      critical section uses lock elision) and outside of transactions.  Thus,
      we need to use atomic accesses to avoid data races.  However, the
      value of adapt_count is just a hint, so relaxed MO accesses are
-     sufficient.
-     Do not begin a transaction if another cpu has locked the
-     futex with normal locking.  If adapt_count is zero, it remains and the
-     next pthread_mutex_lock call will try to start a transaction again.  */
-    if (atomic_load_relaxed (futex) == 0
-	&& atomic_load_relaxed (adapt_count) <= 0 && aconf.try_tbegin > 0)
+     sufficient.  */
+    if (atomic_load_relaxed (adapt_count) <= 0 && aconf.try_tbegin > 0)
     {
       /* Start a transaction and retry it automatically if it aborts with
 	 _HTM_TBEGIN_TRANSIENT.  This macro calls tbegin at most retry_cnt
 	 + 1 times.  The second argument is considered as retry_cnt.  */
       int status = __libc_tbegin_retry ((void *) 0, aconf.try_tbegin - 1);
-      if (__builtin_expect (status == _HTM_TBEGIN_STARTED,
-			    _HTM_TBEGIN_STARTED))
+      if (__glibc_likely (status == _HTM_TBEGIN_STARTED))
 	{
 	  /* Check the futex to make sure nobody has touched it in the
 	     mean time.  This forces the futex into the cache and makes
-	     sure the transaction aborts if some other cpu uses the
-	     lock (writes the futex).  */
-	  if (__builtin_expect (atomic_load_relaxed (futex) == 0, 1))
+	     sure the transaction aborts if another thread acquires the lock
+	     concurrently.  */
+	  if (__glibc_likely (atomic_load_relaxed (futex) == 0))
 	    /* Lock was free.  Return to user code in a transaction.  */
 	    return 0;
 
-	  /* Lock was busy.  Fall back to normal locking.  */
-	  if (__builtin_expect (__libc_tx_nesting_depth () <= 1, 1))
+	  /* Lock was busy.  Fall back to normal locking.
+	     This can be the case if e.g. adapt_count was decremented to zero
+	     by a former release and another thread has been waken up and
+	     acquired it.  */
+	  if (__glibc_likely (__libc_tx_nesting_depth () <= 1))
 	    {
 	      /* In a non-nested transaction there is no need to abort,
 		 which is expensive.  Simply end the started transaction.  */
diff --git a/sysdeps/unix/sysv/linux/s390/elision-trylock.c b/sysdeps/unix/sysv/linux/s390/elision-trylock.c
index eec172a..aa09073 100644
--- a/sysdeps/unix/sysv/linux/s390/elision-trylock.c
+++ b/sysdeps/unix/sysv/linux/s390/elision-trylock.c
@@ -51,31 +51,29 @@ __lll_trylock_elision (int *futex, short *adapt_count)
      critical section uses lock elision) and outside of transactions.  Thus,
      we need to use atomic accesses to avoid data races.  However, the
      value of adapt_count is just a hint, so relaxed MO accesses are
-     sufficient.
-     Do not begin a transaction if another cpu has locked the
-     futex with normal locking.  If adapt_count is zero, it remains and the
-     next pthread_mutex_lock call will try to start a transaction again.  */
-    if (atomic_load_relaxed (futex) == 0
-	&& atomic_load_relaxed (adapt_count) <= 0 && aconf.try_tbegin > 0)
+     sufficient.  */
+    if (atomic_load_relaxed (adapt_count) <= 0 && aconf.try_tbegin > 0)
     {
       int status = __libc_tbegin ((void *) 0);
-      if (__builtin_expect (status  == _HTM_TBEGIN_STARTED,
-			    _HTM_TBEGIN_STARTED))
+      if (__glibc_likely (status  == _HTM_TBEGIN_STARTED))
 	{
 	  /* Check the futex to make sure nobody has touched it in the
 	     mean time.  This forces the futex into the cache and makes
-	     sure the transaction aborts if some other cpu uses the
-	     lock (writes the futex).  */
-	  if (__builtin_expect (atomic_load_relaxed (futex) == 0, 1))
+	     sure the transaction aborts if another thread acquires the lock
+	     concurrently.  */
+	  if (__glibc_likely (atomic_load_relaxed (futex) == 0))
 	    /* Lock was free.  Return to user code in a transaction.  */
 	    return 0;
 
-	  /* Lock was busy.  Fall back to normal locking.  Since we are in
-	     a non-nested transaction there is no need to abort, which is
-	     expensive.  Simply end the started transaction.  */
+	  /* Lock was busy.  Fall back to normal locking.
+	     This can be the case if e.g. adapt_count was decremented to zero
+	     by a former release and another thread has been waken up and
+	     acquired it.
+	     Since we are in a non-nested transaction there is no need to abort,
+	     which is expensive.  Simply end the started transaction.  */
 	  __libc_tend ();
 	  /* Note: Changing the adapt_count here might abort a transaction on a
-	     different cpu, but that could happen anyway when the futex is
+	     different CPU, but that could happen anyway when the futex is
 	     acquired, so there's no need to check the nesting depth here.
 	     See above for why relaxed MO is sufficient.  */
 	  if (aconf.skip_lock_busy > 0)
diff --git a/sysdeps/unix/sysv/linux/s390/elision-unlock.c b/sysdeps/unix/sysv/linux/s390/elision-unlock.c
index d9205fa..c062d71 100644
--- a/sysdeps/unix/sysv/linux/s390/elision-unlock.c
+++ b/sysdeps/unix/sysv/linux/s390/elision-unlock.c
@@ -26,8 +26,12 @@ __lll_unlock_elision(int *futex, short *adapt_count, int private)
   /* If the lock is free, we elided the lock earlier.  This does not
      necessarily mean that we are in a transaction, because the user code may
      have closed the transaction, but that is impossible to detect reliably.
-     Relaxed MO access to futex is sufficient as we only need a hint, if we
-     started a transaction or acquired the futex in e.g. elision-lock.c.  */
+     Relaxed MO access to futex is sufficient because a correct program
+     will only release a lock it has acquired; therefore, it must either
+     changed the futex word's value to something !=0 or it must have used
+     elision; these are actions by the same thread, so these actions are
+     sequenced-before the relaxed load (and thus also happens-before the
+     relaxed load).  Therefore, relaxed MO is sufficient.  */
   if (atomic_load_relaxed (futex) == 0)
     {
       __libc_tend ();
@@ -36,17 +40,17 @@ __lll_unlock_elision(int *futex, short *adapt_count, int private)
     {
       /* Update the adapt_count while unlocking before completing the critical
 	 section.  adapt_count is accessed concurrently outside of a
-	 transaction or an aquired lock e.g. in elision-lock.c so we need to use
-	 atomic accesses.  However, the value of adapt_count is just a hint, so
-	 relaxed MO accesses are sufficient.
+	 transaction or a critical section (e.g. in elision-lock.c). So we need
+	 to use atomic accesses.  However, the value of adapt_count is just a
+	 hint, so relaxed MO accesses are sufficient.
 	 If adapt_count would be decremented while locking, multiple
-	 CPUs trying to lock the locked mutex will decrement adapt_count to
+	 CPUs, trying to lock the acquired mutex, will decrement adapt_count to
 	 zero and another CPU will try to start a transaction, which will be
 	 immediately aborted as the mutex is locked.
-	 If adapt_count would be decremented while unlocking after completing
-	 the critical section, possible waiters will be waked up before
-	 decrementing the adapt_count.  Those waked up waiters could have
-	 destroyed and freed this mutex!  */
+	 The update of adapt_count is done before releasing the lock as POSIX'
+	 mutex destruction requirements disallow accesses to the mutex after it
+	 has been released and thus could have been acquired or destroyed by
+	 another thread.  */
       short adapt_count_val = atomic_load_relaxed (adapt_count);
       if (adapt_count_val > 0)
 	atomic_store_relaxed (adapt_count, adapt_count_val - 1);


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 2/4] S390: Use own tbegin macro instead of __builtin_tbegin.
  2017-01-17 15:28     ` Stefan Liebler
@ 2017-01-17 17:44       ` Torvald Riegel
  0 siblings, 0 replies; 35+ messages in thread
From: Torvald Riegel @ 2017-01-17 17:44 UTC (permalink / raw)
  To: Stefan Liebler; +Cc: libc-alpha

On Tue, 2017-01-17 at 16:28 +0100, Stefan Liebler wrote:
> I've attached the diff here and will later make one patch with changelog 
> for this and the other two patches.

Both diffs for patch 2/4 are OK.


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 3/4] S390: Use new __libc_tbegin_retry macro in elision-lock.c.
  2017-01-17 15:28     ` Stefan Liebler
@ 2017-01-17 17:45       ` Torvald Riegel
  0 siblings, 0 replies; 35+ messages in thread
From: Torvald Riegel @ 2017-01-17 17:45 UTC (permalink / raw)
  To: Stefan Liebler; +Cc: libc-alpha

On Tue, 2017-01-17 at 16:28 +0100, Stefan Liebler wrote:
> I've attached the diff here and will later make one patch with changelog 
> for this and the other two patches.

This diff is OK.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 4/4] S390: Optimize lock-elision by decrementing adapt_count at unlock.
  2017-01-17 15:28     ` Stefan Liebler
@ 2017-01-17 18:53       ` Torvald Riegel
  2017-01-18 12:28         ` Stefan Liebler
  0 siblings, 1 reply; 35+ messages in thread
From: Torvald Riegel @ 2017-01-17 18:53 UTC (permalink / raw)
  To: Stefan Liebler; +Cc: libc-alpha

On Tue, 2017-01-17 at 16:28 +0100, Stefan Liebler wrote:
> On 01/11/2017 11:53 AM, Torvald Riegel wrote:
> > On Tue, 2016-12-06 at 14:51 +0100, Stefan Liebler wrote:
> >> This patch decrements the adapt_count while unlocking the futex
> >> instead of before aquiring the futex as it is done on power, too.
> >> Furthermore a transaction is only started if the futex is currently free.
> >> This check is done after starting the transaction, too.
> >> If the futex is not free and the transaction nesting depth is one,
> >> we can simply end the started transaction instead of aborting it.
> >> The implementation of this check was faulty as it always ended the
> >> started transaction.  By using the fallback path, the the outermost
> >> transaction was aborted.  Now the outermost transaction is aborted
> >> directly.
> >>
> >> This patch also adds some commentary and aligns the code in
> >> elision-trylock.c to the code in elision-lock.c as possible.
> >
> > I don't think this is quite ready yet.  See below for details.
> >
> > I'm not too concerned about this fact, given that it's just in
> > s390-specific code.  But generally, I'd prefer if arch-specific code
> > aims for the same quality and level of consensus about it as what is our
> > aim for generic code.
> >
> >> ChangeLog:
> >>
> >> 	* sysdeps/unix/sysv/linux/s390/lowlevellock.h
> >> 	(__lll_unlock_elision, lll_unlock_elision): Add adapt_count argument.
> >> 	* sysdeps/unix/sysv/linux/s390/elision-lock.c:
> >> 	(__lll_lock_elision): Decrement adapt_count while unlocking
> >> 	instead of before locking.
> >> 	* sysdeps/unix/sysv/linux/s390/elision-trylock.c
> >> 	(__lll_trylock_elision): Likewise.
> >> 	* sysdeps/unix/sysv/linux/s390/elision-unlock.c:
> >> 	(__lll_unlock_elision): Likewise.
> >> ---
> >>  sysdeps/unix/sysv/linux/s390/elision-lock.c    | 37 ++++++++-------
> >>  sysdeps/unix/sysv/linux/s390/elision-trylock.c | 62 ++++++++++++++------------
> >>  sysdeps/unix/sysv/linux/s390/elision-unlock.c  | 29 ++++++++++--
> >>  sysdeps/unix/sysv/linux/s390/lowlevellock.h    |  4 +-
> >>  4 files changed, 78 insertions(+), 54 deletions(-)
> >>
> >> diff --git a/sysdeps/unix/sysv/linux/s390/elision-lock.c b/sysdeps/unix/sysv/linux/s390/elision-lock.c
> >> index 3dd7fbc..4a7d546 100644
> >> --- a/sysdeps/unix/sysv/linux/s390/elision-lock.c
> >> +++ b/sysdeps/unix/sysv/linux/s390/elision-lock.c
> >> @@ -50,31 +50,30 @@ __lll_lock_elision (int *futex, short *adapt_count, EXTRAARG int private)
> >>       critical section uses lock elision) and outside of transactions.  Thus,
> >>       we need to use atomic accesses to avoid data races.  However, the
> >>       value of adapt_count is just a hint, so relaxed MO accesses are
> >> -     sufficient.  */
> >> -  if (atomic_load_relaxed (adapt_count) > 0)
> >> -    {
> >> -      /* Lost updates are possible, but harmless.  Due to races this might lead
> >> -	 to *adapt_count becoming less than zero.  */
> >> -      atomic_store_relaxed (adapt_count,
> >> -			    atomic_load_relaxed (adapt_count) - 1);
> >> -      goto use_lock;
> >> -    }
> >> -
> >> -  if (aconf.try_tbegin > 0)
> >> +     sufficient.
> >> +     Do not begin a transaction if another cpu has locked the
> >> +     futex with normal locking.  If adapt_count is zero, it remains and the
> >> +     next pthread_mutex_lock call will try to start a transaction again.  */
> >
> > This seems to make an assumption about performance that should be
> > explained in the comment.  IIRC, x86 LE does not make this assumption,
> > so it's not generally true.  I suppose s390 aborts are really expensive,
> > and you don't expect that a lock is in the acquired state often enough
> > so that aborts are overall more costly than the overhead of the
> > additional load and branch?
> >
> Yes, aborting a transaction is expensive.
> But you are right, there is an additional load and branch and this 
> thread will anyway wait in LLL_LOCK for another thread releasing the futex.
> See example below.

Note that I'm not actually arguing against having the check -- I just
want a clear explanation in the code including the assumptions about
performance that motivate this particular choice.

If we don't add this information, things will get messy.  And a future
maintainer will not be aware of these assumptions.

> >> +	 atomic accesses.  However, the value of adapt_count is just a hint, so
> >> +	 relaxed MO accesses are sufficient.
> >> +	 If adapt_count would be decremented while locking, multiple
> >> +	 CPUs trying to lock the locked mutex will decrement adapt_count to
> >> +	 zero and another CPU will try to start a transaction, which will be
> >> +	 immediately aborted as the mutex is locked.
> >
> > I don't think this is necessarily the case.  It is true that if more
> > than one thread decrements, only one would immediately try to use
> > elision (because only one decrements from 1 (ignoring lost updates)).
> 
> > However, if you decrement in the critical section, and lock acquisitions
> > wait until the lock is free *before* loading adapt_count and choosing
> > whether to use elision or not, then it shouldn't matter whether you
> > decrement closer to the lock acquisition or closer to the release.
> Waiting for a free lock is done by futex-syscall within LLL_LOCK (see 
> __lll_lock_wait/__lll_lock_wait_private).
> On wakeup the lock is immediately acquired if it is free.
> Afterwards there is no loading of adapt_count and no decision whether to 
> use elision or not.
> Following your suggestion means, that we need a further implementation 
> like in __lll_lock_wait/__lll_lock_wait_private in order to wait for a 
> free lock and then load adapt_count and choose to elide or not!

Well, what would be required is that after we block using futexes, we
reassess the situation (including potentially using elision), instead of
just proceeding to try to acquire the lock without elision.

> 
> Please have a look at the following example assuming:
> adapt_count = 1;
> There are two scenarios below: Imagine that only "Thread 3a" or "Thread 
> 3b" is used.
> 
> Decrement adapt_count while locking (without this patch):
> -Thread 1 __lll_lock_elision:
> decrements adapt_count to 0 and acquires the lock via LLL_LOCK.
> -Thread 2 __lll_lock_elision:
> starts a transaction and ends / aborts it immediately as lock is 
> acquired. adapt_count is set to 3. LLL_LOCK waits until lock is released 
> by Thread 1.
> -Thread 1 __lll_unlock_elision:
> releases lock.
> -Thread 2 __lll_lock_elision:
> wakes up and aquires lock via waiting LLL_LOCK.
> -Thread 3a __lll_lock_elision:
> decrements adapt_count to 2 and waits via LLL_LOCK until lock is 
> released by Thread 2.
> -Thread 2 __lll_unlock_elision:
> releases lock.
> -Thread 3b __lll_lock_elision:
> decrements adapt_count to 2 and acquires lock via LLL_LOCK.
> 
> Decrement adapt_count while unlocking (with this patch):
> -Thread 1 __lll_lock_elision:
> acquires the lock via LLL_LOCK. adapt_count remains 1.
> -Thread 2 __lll_lock_elision:
> LLL_LOCK is used as futex is acquired by Thread 1 or adapt_count > 0. It 
> waits until lock is released by Thread 1.
> -Thread 1 __lll_unlock_elision:
> decrements adapt_count to 0 and releases the lock.
> -Thread 2 __lll_lock_elision:
> wakes up and acquires lock via waiting LLL_LOCK.
> -Thread 3a __lll_lock_elision:
> LLL_LOCK is used as futex is acquired by Thread 2.
> -Thread 2 __lll_unlock_elision:
> releases lock. adapt_count remains 0.
> -Thread 3b __lll_lock_elision:
> starts a transaction.

I agree that if we do NOT wait for the lock to become free before
checking whether we can use elision and do NOT try to use elision after
we blocked using futexes, then decrementing closer to the release of a
mutex can decrease the window in which another thread can observe
adapt_count==0 but lock!=0.

This may have required more changes, but it would have been cleaner
overall, and I guess we would have ended up with less s390-specific
code.

However, it's too late for this now, given that we're past the freeze.
We should get back to this when master opens up for normal development.

> If futex is not tested before starting a transaction,
> the additional load and branch is not needed, Thread 3a will start and 
> abort a transaction, set adapt_count to 3 and will wait via LLL_LOCK.
> In case Thread 3b, a transaction will be started.
> 
> The attached diff removes the futex==0 test.
> Later I will make one patch with changelog for this and the other two 
> patches.

See above, I didn't request to remove it, but just to clearly document
why you included it.  If you think removing it is just as fine, that's
OK with me too.

> >
> > I think this needs a more thorough analysis (including better
> > documentation) and/or a microbenchmark.

Regarding the patch:

+	     Since we are in a non-nested transaction there is no need to abort,
+	     which is expensive.  Simply end the started transaction.  */

Is that actually true?  You don't seem to check whether you are indeed
not in a nested transaction.  The difference is that you do not need to
acquire the lock because this is trylock, so you can run a little
further as a transaction.  You will get aborted as soon the lock you
failed to lock is released though.
So, it's not quite clear to me what the expected win is; are you aiming
at programs that don't actually need to acquire the lock and whose
transactions finish quickly enough to not get aborted by a concurrent
release of the lock?

The rest of the diff looked OK.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 4/4] S390: Optimize lock-elision by decrementing adapt_count at unlock.
  2017-01-17 18:53       ` Torvald Riegel
@ 2017-01-18 12:28         ` Stefan Liebler
  2017-01-18 16:25           ` Torvald Riegel
  0 siblings, 1 reply; 35+ messages in thread
From: Stefan Liebler @ 2017-01-18 12:28 UTC (permalink / raw)
  To: libc-alpha

On 01/17/2017 07:52 PM, Torvald Riegel wrote:
> On Tue, 2017-01-17 at 16:28 +0100, Stefan Liebler wrote:
>> On 01/11/2017 11:53 AM, Torvald Riegel wrote:
>>> On Tue, 2016-12-06 at 14:51 +0100, Stefan Liebler wrote:
>>>> This patch decrements the adapt_count while unlocking the futex
>>>> instead of before aquiring the futex as it is done on power, too.
>>>> Furthermore a transaction is only started if the futex is currently free.
>>>> This check is done after starting the transaction, too.
>>>> If the futex is not free and the transaction nesting depth is one,
>>>> we can simply end the started transaction instead of aborting it.
>>>> The implementation of this check was faulty as it always ended the
>>>> started transaction.  By using the fallback path, the the outermost
>>>> transaction was aborted.  Now the outermost transaction is aborted
>>>> directly.
>>>>
>>>> This patch also adds some commentary and aligns the code in
>>>> elision-trylock.c to the code in elision-lock.c as possible.
>>>
>>> I don't think this is quite ready yet.  See below for details.
>>>
>>> I'm not too concerned about this fact, given that it's just in
>>> s390-specific code.  But generally, I'd prefer if arch-specific code
>>> aims for the same quality and level of consensus about it as what is our
>>> aim for generic code.
>>>
>>>> ChangeLog:
>>>>
>>>> 	* sysdeps/unix/sysv/linux/s390/lowlevellock.h
>>>> 	(__lll_unlock_elision, lll_unlock_elision): Add adapt_count argument.
>>>> 	* sysdeps/unix/sysv/linux/s390/elision-lock.c:
>>>> 	(__lll_lock_elision): Decrement adapt_count while unlocking
>>>> 	instead of before locking.
>>>> 	* sysdeps/unix/sysv/linux/s390/elision-trylock.c
>>>> 	(__lll_trylock_elision): Likewise.
>>>> 	* sysdeps/unix/sysv/linux/s390/elision-unlock.c:
>>>> 	(__lll_unlock_elision): Likewise.
>>>> ---
>>>>  sysdeps/unix/sysv/linux/s390/elision-lock.c    | 37 ++++++++-------
>>>>  sysdeps/unix/sysv/linux/s390/elision-trylock.c | 62 ++++++++++++++------------
>>>>  sysdeps/unix/sysv/linux/s390/elision-unlock.c  | 29 ++++++++++--
>>>>  sysdeps/unix/sysv/linux/s390/lowlevellock.h    |  4 +-
>>>>  4 files changed, 78 insertions(+), 54 deletions(-)
>>>>
>>>> diff --git a/sysdeps/unix/sysv/linux/s390/elision-lock.c b/sysdeps/unix/sysv/linux/s390/elision-lock.c
>>>> index 3dd7fbc..4a7d546 100644
>>>> --- a/sysdeps/unix/sysv/linux/s390/elision-lock.c
>>>> +++ b/sysdeps/unix/sysv/linux/s390/elision-lock.c
>>>> @@ -50,31 +50,30 @@ __lll_lock_elision (int *futex, short *adapt_count, EXTRAARG int private)
>>>>       critical section uses lock elision) and outside of transactions.  Thus,
>>>>       we need to use atomic accesses to avoid data races.  However, the
>>>>       value of adapt_count is just a hint, so relaxed MO accesses are
>>>> -     sufficient.  */
>>>> -  if (atomic_load_relaxed (adapt_count) > 0)
>>>> -    {
>>>> -      /* Lost updates are possible, but harmless.  Due to races this might lead
>>>> -	 to *adapt_count becoming less than zero.  */
>>>> -      atomic_store_relaxed (adapt_count,
>>>> -			    atomic_load_relaxed (adapt_count) - 1);
>>>> -      goto use_lock;
>>>> -    }
>>>> -
>>>> -  if (aconf.try_tbegin > 0)
>>>> +     sufficient.
>>>> +     Do not begin a transaction if another cpu has locked the
>>>> +     futex with normal locking.  If adapt_count is zero, it remains and the
>>>> +     next pthread_mutex_lock call will try to start a transaction again.  */
>>>
>>> This seems to make an assumption about performance that should be
>>> explained in the comment.  IIRC, x86 LE does not make this assumption,
>>> so it's not generally true.  I suppose s390 aborts are really expensive,
>>> and you don't expect that a lock is in the acquired state often enough
>>> so that aborts are overall more costly than the overhead of the
>>> additional load and branch?
>>>
>> Yes, aborting a transaction is expensive.
>> But you are right, there is an additional load and branch and this
>> thread will anyway wait in LLL_LOCK for another thread releasing the futex.
>> See example below.
>
> Note that I'm not actually arguing against having the check -- I just
> want a clear explanation in the code including the assumptions about
> performance that motivate this particular choice.
>
> If we don't add this information, things will get messy.  And a future
> maintainer will not be aware of these assumptions.
>
>>>> +	 atomic accesses.  However, the value of adapt_count is just a hint, so
>>>> +	 relaxed MO accesses are sufficient.
>>>> +	 If adapt_count would be decremented while locking, multiple
>>>> +	 CPUs trying to lock the locked mutex will decrement adapt_count to
>>>> +	 zero and another CPU will try to start a transaction, which will be
>>>> +	 immediately aborted as the mutex is locked.
>>>
>>> I don't think this is necessarily the case.  It is true that if more
>>> than one thread decrements, only one would immediately try to use
>>> elision (because only one decrements from 1 (ignoring lost updates)).
>>
>>> However, if you decrement in the critical section, and lock acquisitions
>>> wait until the lock is free *before* loading adapt_count and choosing
>>> whether to use elision or not, then it shouldn't matter whether you
>>> decrement closer to the lock acquisition or closer to the release.
>> Waiting for a free lock is done by futex-syscall within LLL_LOCK (see
>> __lll_lock_wait/__lll_lock_wait_private).
>> On wakeup the lock is immediately acquired if it is free.
>> Afterwards there is no loading of adapt_count and no decision whether to
>> use elision or not.
>> Following your suggestion means, that we need a further implementation
>> like in __lll_lock_wait/__lll_lock_wait_private in order to wait for a
>> free lock and then load adapt_count and choose to elide or not!
>
> Well, what would be required is that after we block using futexes, we
> reassess the situation (including potentially using elision), instead of
> just proceeding to try to acquire the lock without elision.
>
If the lock is released and we have been waking up.
Do we know if other threads are also waiting for the futex?
The strategy of __lll_lock_wait is to keep futex==2 and wakup possible 
waiters with __lll_unlock.

>>
>> Please have a look at the following example assuming:
>> adapt_count = 1;
>> There are two scenarios below: Imagine that only "Thread 3a" or "Thread
>> 3b" is used.
>>
>> Decrement adapt_count while locking (without this patch):
>> -Thread 1 __lll_lock_elision:
>> decrements adapt_count to 0 and acquires the lock via LLL_LOCK.
>> -Thread 2 __lll_lock_elision:
>> starts a transaction and ends / aborts it immediately as lock is
>> acquired. adapt_count is set to 3. LLL_LOCK waits until lock is released
>> by Thread 1.
>> -Thread 1 __lll_unlock_elision:
>> releases lock.
>> -Thread 2 __lll_lock_elision:
>> wakes up and aquires lock via waiting LLL_LOCK.
>> -Thread 3a __lll_lock_elision:
>> decrements adapt_count to 2 and waits via LLL_LOCK until lock is
>> released by Thread 2.
>> -Thread 2 __lll_unlock_elision:
>> releases lock.
>> -Thread 3b __lll_lock_elision:
>> decrements adapt_count to 2 and acquires lock via LLL_LOCK.
>>
>> Decrement adapt_count while unlocking (with this patch):
>> -Thread 1 __lll_lock_elision:
>> acquires the lock via LLL_LOCK. adapt_count remains 1.
>> -Thread 2 __lll_lock_elision:
>> LLL_LOCK is used as futex is acquired by Thread 1 or adapt_count > 0. It
>> waits until lock is released by Thread 1.
>> -Thread 1 __lll_unlock_elision:
>> decrements adapt_count to 0 and releases the lock.
>> -Thread 2 __lll_lock_elision:
>> wakes up and acquires lock via waiting LLL_LOCK.
>> -Thread 3a __lll_lock_elision:
>> LLL_LOCK is used as futex is acquired by Thread 2.
>> -Thread 2 __lll_unlock_elision:
>> releases lock. adapt_count remains 0.
>> -Thread 3b __lll_lock_elision:
>> starts a transaction.
>
> I agree that if we do NOT wait for the lock to become free before
> checking whether we can use elision and do NOT try to use elision after
> we blocked using futexes, then decrementing closer to the release of a
> mutex can decrease the window in which another thread can observe
> adapt_count==0 but lock!=0.
>
> This may have required more changes, but it would have been cleaner
> overall, and I guess we would have ended up with less s390-specific
> code.
>
> However, it's too late for this now, given that we're past the freeze.
> We should get back to this when master opens up for normal development.
>
>> If futex is not tested before starting a transaction,
>> the additional load and branch is not needed, Thread 3a will start and
>> abort a transaction, set adapt_count to 3 and will wait via LLL_LOCK.
>> In case Thread 3b, a transaction will be started.
>>
>> The attached diff removes the futex==0 test.
>> Later I will make one patch with changelog for this and the other two
>> patches.
>
> See above, I didn't request to remove it, but just to clearly document
> why you included it.  If you think removing it is just as fine, that's
> OK with me too.
>
Removing it is fine.
>>>
>>> I think this needs a more thorough analysis (including better
>>> documentation) and/or a microbenchmark.
>
> Regarding the patch:
>
> +	     Since we are in a non-nested transaction there is no need to abort,
> +	     which is expensive.  Simply end the started transaction.  */
>
> Is that actually true?  You don't seem to check whether you are indeed
> not in a nested transaction.  The difference is that you do not need to
> acquire the lock because this is trylock, so you can run a little
> further as a transaction.  You will get aborted as soon the lock you
> failed to lock is released though.
> So, it's not quite clear to me what the expected win is; are you aiming
> at programs that don't actually need to acquire the lock and whose
> transactions finish quickly enough to not get aborted by a concurrent
> release of the lock?
>
Every time we enter __lll_trylock_elision, we check if we are in a 
transaction on this CPU and abort it if needed. This is needed to detect 
multiple elided trylock calls of one thread.
Afterwards we start a transaction (if adapt_count permits it) and then 
we check the futex.
If it is free, we return within a transaction. If another thread 
acquired the lock with a transaction, we can't detect it here. But a 
conflict will abort the transactions.
If the futex is already acquired, the call shall return immediately with 
an error. Thus the current transaction, which was started before, is 
ended. You are right, if the other thread releases the lock, the 
transaction will be aborted.
As ending a transaction is faster than aborting, the transaction is 
ended. The power implementation also ends it. On intel it is aborted, 
but according to the comment, an abort is used instead of an end due to 
visibility reasons while profiling.


> The rest of the diff looked OK.
>
If it's okay, I will make one patch with all the reviewed diffs and post 
it with a ChangeLog in this mail-thread.
Then I would commit it before release to have a consistent state?

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 4/4] S390: Optimize lock-elision by decrementing adapt_count at unlock.
  2017-01-18 12:28         ` Stefan Liebler
@ 2017-01-18 16:25           ` Torvald Riegel
  2017-01-19 11:14             ` Stefan Liebler
  0 siblings, 1 reply; 35+ messages in thread
From: Torvald Riegel @ 2017-01-18 16:25 UTC (permalink / raw)
  To: Stefan Liebler; +Cc: libc-alpha

On Wed, 2017-01-18 at 13:28 +0100, Stefan Liebler wrote:
> On 01/17/2017 07:52 PM, Torvald Riegel wrote:
> > On Tue, 2017-01-17 at 16:28 +0100, Stefan Liebler wrote:
> >> On 01/11/2017 11:53 AM, Torvald Riegel wrote:
> >>> On Tue, 2016-12-06 at 14:51 +0100, Stefan Liebler wrote:
> >>>> This patch decrements the adapt_count while unlocking the futex
> >>>> instead of before aquiring the futex as it is done on power, too.
> >>>> Furthermore a transaction is only started if the futex is currently free.
> >>>> This check is done after starting the transaction, too.
> >>>> If the futex is not free and the transaction nesting depth is one,
> >>>> we can simply end the started transaction instead of aborting it.
> >>>> The implementation of this check was faulty as it always ended the
> >>>> started transaction.  By using the fallback path, the the outermost
> >>>> transaction was aborted.  Now the outermost transaction is aborted
> >>>> directly.
> >>>>
> >>>> This patch also adds some commentary and aligns the code in
> >>>> elision-trylock.c to the code in elision-lock.c as possible.
> >>>
> >>> I don't think this is quite ready yet.  See below for details.
> >>>
> >>> I'm not too concerned about this fact, given that it's just in
> >>> s390-specific code.  But generally, I'd prefer if arch-specific code
> >>> aims for the same quality and level of consensus about it as what is our
> >>> aim for generic code.
> >>>
> >>>> ChangeLog:
> >>>>
> >>>> 	* sysdeps/unix/sysv/linux/s390/lowlevellock.h
> >>>> 	(__lll_unlock_elision, lll_unlock_elision): Add adapt_count argument.
> >>>> 	* sysdeps/unix/sysv/linux/s390/elision-lock.c:
> >>>> 	(__lll_lock_elision): Decrement adapt_count while unlocking
> >>>> 	instead of before locking.
> >>>> 	* sysdeps/unix/sysv/linux/s390/elision-trylock.c
> >>>> 	(__lll_trylock_elision): Likewise.
> >>>> 	* sysdeps/unix/sysv/linux/s390/elision-unlock.c:
> >>>> 	(__lll_unlock_elision): Likewise.
> >>>> ---
> >>>>  sysdeps/unix/sysv/linux/s390/elision-lock.c    | 37 ++++++++-------
> >>>>  sysdeps/unix/sysv/linux/s390/elision-trylock.c | 62 ++++++++++++++------------
> >>>>  sysdeps/unix/sysv/linux/s390/elision-unlock.c  | 29 ++++++++++--
> >>>>  sysdeps/unix/sysv/linux/s390/lowlevellock.h    |  4 +-
> >>>>  4 files changed, 78 insertions(+), 54 deletions(-)
> >>>>
> >>>> diff --git a/sysdeps/unix/sysv/linux/s390/elision-lock.c b/sysdeps/unix/sysv/linux/s390/elision-lock.c
> >>>> index 3dd7fbc..4a7d546 100644
> >>>> --- a/sysdeps/unix/sysv/linux/s390/elision-lock.c
> >>>> +++ b/sysdeps/unix/sysv/linux/s390/elision-lock.c
> >>>> @@ -50,31 +50,30 @@ __lll_lock_elision (int *futex, short *adapt_count, EXTRAARG int private)
> >>>>       critical section uses lock elision) and outside of transactions.  Thus,
> >>>>       we need to use atomic accesses to avoid data races.  However, the
> >>>>       value of adapt_count is just a hint, so relaxed MO accesses are
> >>>> -     sufficient.  */
> >>>> -  if (atomic_load_relaxed (adapt_count) > 0)
> >>>> -    {
> >>>> -      /* Lost updates are possible, but harmless.  Due to races this might lead
> >>>> -	 to *adapt_count becoming less than zero.  */
> >>>> -      atomic_store_relaxed (adapt_count,
> >>>> -			    atomic_load_relaxed (adapt_count) - 1);
> >>>> -      goto use_lock;
> >>>> -    }
> >>>> -
> >>>> -  if (aconf.try_tbegin > 0)
> >>>> +     sufficient.
> >>>> +     Do not begin a transaction if another cpu has locked the
> >>>> +     futex with normal locking.  If adapt_count is zero, it remains and the
> >>>> +     next pthread_mutex_lock call will try to start a transaction again.  */
> >>>
> >>> This seems to make an assumption about performance that should be
> >>> explained in the comment.  IIRC, x86 LE does not make this assumption,
> >>> so it's not generally true.  I suppose s390 aborts are really expensive,
> >>> and you don't expect that a lock is in the acquired state often enough
> >>> so that aborts are overall more costly than the overhead of the
> >>> additional load and branch?
> >>>
> >> Yes, aborting a transaction is expensive.
> >> But you are right, there is an additional load and branch and this
> >> thread will anyway wait in LLL_LOCK for another thread releasing the futex.
> >> See example below.
> >
> > Note that I'm not actually arguing against having the check -- I just
> > want a clear explanation in the code including the assumptions about
> > performance that motivate this particular choice.
> >
> > If we don't add this information, things will get messy.  And a future
> > maintainer will not be aware of these assumptions.
> >
> >>>> +	 atomic accesses.  However, the value of adapt_count is just a hint, so
> >>>> +	 relaxed MO accesses are sufficient.
> >>>> +	 If adapt_count would be decremented while locking, multiple
> >>>> +	 CPUs trying to lock the locked mutex will decrement adapt_count to
> >>>> +	 zero and another CPU will try to start a transaction, which will be
> >>>> +	 immediately aborted as the mutex is locked.
> >>>
> >>> I don't think this is necessarily the case.  It is true that if more
> >>> than one thread decrements, only one would immediately try to use
> >>> elision (because only one decrements from 1 (ignoring lost updates)).
> >>
> >>> However, if you decrement in the critical section, and lock acquisitions
> >>> wait until the lock is free *before* loading adapt_count and choosing
> >>> whether to use elision or not, then it shouldn't matter whether you
> >>> decrement closer to the lock acquisition or closer to the release.
> >> Waiting for a free lock is done by futex-syscall within LLL_LOCK (see
> >> __lll_lock_wait/__lll_lock_wait_private).
> >> On wakeup the lock is immediately acquired if it is free.
> >> Afterwards there is no loading of adapt_count and no decision whether to
> >> use elision or not.
> >> Following your suggestion means, that we need a further implementation
> >> like in __lll_lock_wait/__lll_lock_wait_private in order to wait for a
> >> free lock and then load adapt_count and choose to elide or not!
> >
> > Well, what would be required is that after we block using futexes, we
> > reassess the situation (including potentially using elision), instead of
> > just proceeding to try to acquire the lock without elision.
> >
> If the lock is released and we have been waking up.
> Do we know if other threads are also waiting for the futex?
> The strategy of __lll_lock_wait is to keep futex==2 and wakup possible 
> waiters with __lll_unlock.

We don't know if there are other threads waiting iff futex==2.  This is
because we may have shared that "state" with other threads.
However, we can force that there are no other threads waiting by waking
up all of them.  Some more coordination may be required to make all of
them aware of that fact.
This convoying in the futex==2 state is also bad irrespective of lock
elision, I believe; for example, if we have one long critical section
that forces all other short critical sections into the futex==2 state,
these will hand off ownership of the lock through the slow path, instead
of just waking them all at once so that they can use spinning to acquire
the lock with less overhead (because they are short).

Thus, revisiting this in the upcoming development cycle, and in the
generic mutex code, would be good.

> >>
> >> Please have a look at the following example assuming:
> >> adapt_count = 1;
> >> There are two scenarios below: Imagine that only "Thread 3a" or "Thread
> >> 3b" is used.
> >>
> >> Decrement adapt_count while locking (without this patch):
> >> -Thread 1 __lll_lock_elision:
> >> decrements adapt_count to 0 and acquires the lock via LLL_LOCK.
> >> -Thread 2 __lll_lock_elision:
> >> starts a transaction and ends / aborts it immediately as lock is
> >> acquired. adapt_count is set to 3. LLL_LOCK waits until lock is released
> >> by Thread 1.
> >> -Thread 1 __lll_unlock_elision:
> >> releases lock.
> >> -Thread 2 __lll_lock_elision:
> >> wakes up and aquires lock via waiting LLL_LOCK.
> >> -Thread 3a __lll_lock_elision:
> >> decrements adapt_count to 2 and waits via LLL_LOCK until lock is
> >> released by Thread 2.
> >> -Thread 2 __lll_unlock_elision:
> >> releases lock.
> >> -Thread 3b __lll_lock_elision:
> >> decrements adapt_count to 2 and acquires lock via LLL_LOCK.
> >>
> >> Decrement adapt_count while unlocking (with this patch):
> >> -Thread 1 __lll_lock_elision:
> >> acquires the lock via LLL_LOCK. adapt_count remains 1.
> >> -Thread 2 __lll_lock_elision:
> >> LLL_LOCK is used as futex is acquired by Thread 1 or adapt_count > 0. It
> >> waits until lock is released by Thread 1.
> >> -Thread 1 __lll_unlock_elision:
> >> decrements adapt_count to 0 and releases the lock.
> >> -Thread 2 __lll_lock_elision:
> >> wakes up and acquires lock via waiting LLL_LOCK.
> >> -Thread 3a __lll_lock_elision:
> >> LLL_LOCK is used as futex is acquired by Thread 2.
> >> -Thread 2 __lll_unlock_elision:
> >> releases lock. adapt_count remains 0.
> >> -Thread 3b __lll_lock_elision:
> >> starts a transaction.
> >
> > I agree that if we do NOT wait for the lock to become free before
> > checking whether we can use elision and do NOT try to use elision after
> > we blocked using futexes, then decrementing closer to the release of a
> > mutex can decrease the window in which another thread can observe
> > adapt_count==0 but lock!=0.
> >
> > This may have required more changes, but it would have been cleaner
> > overall, and I guess we would have ended up with less s390-specific
> > code.
> >
> > However, it's too late for this now, given that we're past the freeze.
> > We should get back to this when master opens up for normal development.
> >
> >> If futex is not tested before starting a transaction,
> >> the additional load and branch is not needed, Thread 3a will start and
> >> abort a transaction, set adapt_count to 3 and will wait via LLL_LOCK.
> >> In case Thread 3b, a transaction will be started.
> >>
> >> The attached diff removes the futex==0 test.
> >> Later I will make one patch with changelog for this and the other two
> >> patches.
> >
> > See above, I didn't request to remove it, but just to clearly document
> > why you included it.  If you think removing it is just as fine, that's
> > OK with me too.
> >
> Removing it is fine.
> >>>
> >>> I think this needs a more thorough analysis (including better
> >>> documentation) and/or a microbenchmark.
> >
> > Regarding the patch:
> >
> > +	     Since we are in a non-nested transaction there is no need to abort,
> > +	     which is expensive.  Simply end the started transaction.  */
> >
> > Is that actually true?  You don't seem to check whether you are indeed
> > not in a nested transaction.  The difference is that you do not need to
> > acquire the lock because this is trylock, so you can run a little
> > further as a transaction.  You will get aborted as soon the lock you
> > failed to lock is released though.
> > So, it's not quite clear to me what the expected win is; are you aiming
> > at programs that don't actually need to acquire the lock and whose
> > transactions finish quickly enough to not get aborted by a concurrent
> > release of the lock?
> >
> Every time we enter __lll_trylock_elision, we check if we are in a 
> transaction on this CPU and abort it if needed. This is needed to detect 
> multiple elided trylock calls of one thread.

You are right, sorry.  I had forgotten about that point.

> > The rest of the diff looked OK.
> >
> If it's okay, I will make one patch with all the reviewed diffs and post 
> it with a ChangeLog in this mail-thread.
> Then I would commit it before release to have a consistent state?

Yes, please commit this.



^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 4/4] S390: Optimize lock-elision by decrementing adapt_count at unlock.
  2017-01-18 16:25           ` Torvald Riegel
@ 2017-01-19 11:14             ` Stefan Liebler
  2017-01-20  8:54               ` Stefan Liebler
  0 siblings, 1 reply; 35+ messages in thread
From: Stefan Liebler @ 2017-01-19 11:14 UTC (permalink / raw)
  To: libc-alpha

[-- Attachment #1: Type: text/plain, Size: 1617 bytes --]

On 01/18/2017 05:25 PM, Torvald Riegel wrote:
> On Wed, 2017-01-18 at 13:28 +0100, Stefan Liebler wrote:
>> On 01/17/2017 07:52 PM, Torvald Riegel wrote:
>>> On Tue, 2017-01-17 at 16:28 +0100, Stefan Liebler wrote:
>>> The rest of the diff looked OK.
>>>
>> If it's okay, I will make one patch with all the reviewed diffs and post
>> it with a ChangeLog in this mail-thread.
>> Then I would commit it before release to have a consistent state?
>
> Yes, please commit this.
>
>
>
Okay. I'll commit this patch.
Thanks.

S390: Adjust lock elision code after review.

This patch adjusts s390 specific lock elision code after review
of the following patches:
-S390: Use own tbegin macro instead of __builtin_tbegin.
(8bfc4a2ab4bebdf86c151665aae8a266e2f18fb4)
-S390: Use new __libc_tbegin_retry macro in elision-lock.c.
(53c5c3d5ac238901c13f28a73ba05b0678094e80)
-S390: Optimize lock-elision by decrementing adapt_count at unlock.
(dd037fb3df286b7c2d0b0c6f8d02a2dd8a8e8a08)

The futex value is not tested before starting a transaction,
__glibc_likely is used instead of __builtin_expect and comments
are adjusted.

ChangeLog:

	* sysdeps/unix/sysv/linux/s390/htm.h: Adjust comments.
	* sysdeps/unix/sysv/linux/s390/elision-unlock.c: Likewise.
	* sysdeps/unix/sysv/linux/s390/elision-lock.c: Adjust comments.
	(__lll_lock_elision): Do not test futex before starting a
	transaction.  Use __glibc_likely instead of __builtin_expect.
	* sysdeps/unix/sysv/linux/s390/elision-trylock.c: Adjust comments.
	(__lll_trylock_elision): Do not test futex before starting a
	transaction.  Use __glibc_likely instead of __builtin_expect.

[-- Attachment #2: 20170119_lock_elision_review.patch --]
[-- Type: text/x-patch, Size: 11366 bytes --]

commit abdf7e3d264e56898f474405602205279895d9b6
Author: Stefan Liebler <stli@linux.vnet.ibm.com>
Date:   Thu Jan 19 10:44:18 2017 +0100

    S390: Adjust lock elision code after review.
    
    This patch adjusts s390 specific lock elision code after review
    of the following patches:
    -S390: Use own tbegin macro instead of __builtin_tbegin.
    (8bfc4a2ab4bebdf86c151665aae8a266e2f18fb4)
    -S390: Use new __libc_tbegin_retry macro in elision-lock.c.
    (53c5c3d5ac238901c13f28a73ba05b0678094e80)
    -S390: Optimize lock-elision by decrementing adapt_count at unlock.
    (dd037fb3df286b7c2d0b0c6f8d02a2dd8a8e8a08)
    
    The futex value is not tested before starting a transaction,
    __glibc_likely is used instead of __builtin_expect and comments
    are adjusted.
    
    ChangeLog:
    
    	* sysdeps/unix/sysv/linux/s390/htm.h: Adjust comments.
    	* sysdeps/unix/sysv/linux/s390/elision-unlock.c: Likewise.
    	* sysdeps/unix/sysv/linux/s390/elision-lock.c: Adjust comments.
    	(__lll_lock_elision): Do not test futex before starting a
    	transaction.  Use __glibc_likely instead of __builtin_expect.
    	* sysdeps/unix/sysv/linux/s390/elision-trylock.c: Adjust comments.
    	(__lll_trylock_elision): Do not test futex before starting a
    	transaction.  Use __glibc_likely instead of __builtin_expect.

diff --git a/sysdeps/unix/sysv/linux/s390/elision-lock.c b/sysdeps/unix/sysv/linux/s390/elision-lock.c
index a7c0fcc..0081537 100644
--- a/sysdeps/unix/sysv/linux/s390/elision-lock.c
+++ b/sysdeps/unix/sysv/linux/s390/elision-lock.c
@@ -50,27 +50,28 @@ __lll_lock_elision (int *futex, short *adapt_count, EXTRAARG int private)
      critical section uses lock elision) and outside of transactions.  Thus,
      we need to use atomic accesses to avoid data races.  However, the
      value of adapt_count is just a hint, so relaxed MO accesses are
-     sufficient.
-     Do not begin a transaction if another cpu has locked the
-     futex with normal locking.  If adapt_count is zero, it remains and the
-     next pthread_mutex_lock call will try to start a transaction again.  */
-    if (atomic_load_relaxed (futex) == 0
-	&& atomic_load_relaxed (adapt_count) <= 0 && aconf.try_tbegin > 0)
+     sufficient.  */
+    if (atomic_load_relaxed (adapt_count) <= 0 && aconf.try_tbegin > 0)
     {
+      /* Start a transaction and retry it automatically if it aborts with
+	 _HTM_TBEGIN_TRANSIENT.  This macro calls tbegin at most retry_cnt
+	 + 1 times.  The second argument is considered as retry_cnt.  */
       int status = __libc_tbegin_retry ((void *) 0, aconf.try_tbegin - 1);
-      if (__builtin_expect (status == _HTM_TBEGIN_STARTED,
-			    _HTM_TBEGIN_STARTED))
+      if (__glibc_likely (status == _HTM_TBEGIN_STARTED))
 	{
 	  /* Check the futex to make sure nobody has touched it in the
 	     mean time.  This forces the futex into the cache and makes
-	     sure the transaction aborts if some other cpu uses the
-	     lock (writes the futex).  */
-	  if (__builtin_expect (atomic_load_relaxed (futex) == 0, 1))
+	     sure the transaction aborts if another thread acquires the lock
+	     concurrently.  */
+	  if (__glibc_likely (atomic_load_relaxed (futex) == 0))
 	    /* Lock was free.  Return to user code in a transaction.  */
 	    return 0;
 
-	  /* Lock was busy.  Fall back to normal locking.  */
-	  if (__builtin_expect (__libc_tx_nesting_depth () <= 1, 1))
+	  /* Lock was busy.  Fall back to normal locking.
+	     This can be the case if e.g. adapt_count was decremented to zero
+	     by a former release and another thread has been waken up and
+	     acquired it.  */
+	  if (__glibc_likely (__libc_tx_nesting_depth () <= 1))
 	    {
 	      /* In a non-nested transaction there is no need to abort,
 		 which is expensive.  Simply end the started transaction.  */
@@ -118,6 +119,7 @@ __lll_lock_elision (int *futex, short *adapt_count, EXTRAARG int private)
 	}
     }
 
-  /* Use normal locking as fallback path if transaction does not succeed.  */
+  /* Use normal locking as fallback path if the transaction does not
+     succeed.  */
   return LLL_LOCK ((*futex), private);
 }
diff --git a/sysdeps/unix/sysv/linux/s390/elision-trylock.c b/sysdeps/unix/sysv/linux/s390/elision-trylock.c
index 3c1d009..aa09073 100644
--- a/sysdeps/unix/sysv/linux/s390/elision-trylock.c
+++ b/sysdeps/unix/sysv/linux/s390/elision-trylock.c
@@ -51,31 +51,29 @@ __lll_trylock_elision (int *futex, short *adapt_count)
      critical section uses lock elision) and outside of transactions.  Thus,
      we need to use atomic accesses to avoid data races.  However, the
      value of adapt_count is just a hint, so relaxed MO accesses are
-     sufficient.
-     Do not begin a transaction if another cpu has locked the
-     futex with normal locking.  If adapt_count is zero, it remains and the
-     next pthread_mutex_lock call will try to start a transaction again.  */
-    if (atomic_load_relaxed (futex) == 0
-	&& atomic_load_relaxed (adapt_count) <= 0 && aconf.try_tbegin > 0)
+     sufficient.  */
+    if (atomic_load_relaxed (adapt_count) <= 0 && aconf.try_tbegin > 0)
     {
       int status = __libc_tbegin ((void *) 0);
-      if (__builtin_expect (status  == _HTM_TBEGIN_STARTED,
-			    _HTM_TBEGIN_STARTED))
+      if (__glibc_likely (status  == _HTM_TBEGIN_STARTED))
 	{
 	  /* Check the futex to make sure nobody has touched it in the
 	     mean time.  This forces the futex into the cache and makes
-	     sure the transaction aborts if some other cpu uses the
-	     lock (writes the futex).  */
-	  if (__builtin_expect (atomic_load_relaxed (futex) == 0, 1))
+	     sure the transaction aborts if another thread acquires the lock
+	     concurrently.  */
+	  if (__glibc_likely (atomic_load_relaxed (futex) == 0))
 	    /* Lock was free.  Return to user code in a transaction.  */
 	    return 0;
 
-	  /* Lock was busy.  Fall back to normal locking.  Since we are in
-	     a non-nested transaction there is no need to abort, which is
-	     expensive.  Simply end the started transaction.  */
+	  /* Lock was busy.  Fall back to normal locking.
+	     This can be the case if e.g. adapt_count was decremented to zero
+	     by a former release and another thread has been waken up and
+	     acquired it.
+	     Since we are in a non-nested transaction there is no need to abort,
+	     which is expensive.  Simply end the started transaction.  */
 	  __libc_tend ();
 	  /* Note: Changing the adapt_count here might abort a transaction on a
-	     different cpu, but that could happen anyway when the futex is
+	     different CPU, but that could happen anyway when the futex is
 	     acquired, so there's no need to check the nesting depth here.
 	     See above for why relaxed MO is sufficient.  */
 	  if (aconf.skip_lock_busy > 0)
@@ -93,6 +91,7 @@ __lll_trylock_elision (int *futex, short *adapt_count)
       /* Could do some retries here.  */
     }
 
-  /* Use normal locking as fallback path if transaction does not succeed.  */
+  /* Use normal locking as fallback path if the transaction does not
+     succeed.  */
   return lll_trylock (*futex);
 }
diff --git a/sysdeps/unix/sysv/linux/s390/elision-unlock.c b/sysdeps/unix/sysv/linux/s390/elision-unlock.c
index d9205fa..c062d71 100644
--- a/sysdeps/unix/sysv/linux/s390/elision-unlock.c
+++ b/sysdeps/unix/sysv/linux/s390/elision-unlock.c
@@ -26,8 +26,12 @@ __lll_unlock_elision(int *futex, short *adapt_count, int private)
   /* If the lock is free, we elided the lock earlier.  This does not
      necessarily mean that we are in a transaction, because the user code may
      have closed the transaction, but that is impossible to detect reliably.
-     Relaxed MO access to futex is sufficient as we only need a hint, if we
-     started a transaction or acquired the futex in e.g. elision-lock.c.  */
+     Relaxed MO access to futex is sufficient because a correct program
+     will only release a lock it has acquired; therefore, it must either
+     changed the futex word's value to something !=0 or it must have used
+     elision; these are actions by the same thread, so these actions are
+     sequenced-before the relaxed load (and thus also happens-before the
+     relaxed load).  Therefore, relaxed MO is sufficient.  */
   if (atomic_load_relaxed (futex) == 0)
     {
       __libc_tend ();
@@ -36,17 +40,17 @@ __lll_unlock_elision(int *futex, short *adapt_count, int private)
     {
       /* Update the adapt_count while unlocking before completing the critical
 	 section.  adapt_count is accessed concurrently outside of a
-	 transaction or an aquired lock e.g. in elision-lock.c so we need to use
-	 atomic accesses.  However, the value of adapt_count is just a hint, so
-	 relaxed MO accesses are sufficient.
+	 transaction or a critical section (e.g. in elision-lock.c). So we need
+	 to use atomic accesses.  However, the value of adapt_count is just a
+	 hint, so relaxed MO accesses are sufficient.
 	 If adapt_count would be decremented while locking, multiple
-	 CPUs trying to lock the locked mutex will decrement adapt_count to
+	 CPUs, trying to lock the acquired mutex, will decrement adapt_count to
 	 zero and another CPU will try to start a transaction, which will be
 	 immediately aborted as the mutex is locked.
-	 If adapt_count would be decremented while unlocking after completing
-	 the critical section, possible waiters will be waked up before
-	 decrementing the adapt_count.  Those waked up waiters could have
-	 destroyed and freed this mutex!  */
+	 The update of adapt_count is done before releasing the lock as POSIX'
+	 mutex destruction requirements disallow accesses to the mutex after it
+	 has been released and thus could have been acquired or destroyed by
+	 another thread.  */
       short adapt_count_val = atomic_load_relaxed (adapt_count);
       if (adapt_count_val > 0)
 	atomic_store_relaxed (adapt_count, adapt_count_val - 1);
diff --git a/sysdeps/unix/sysv/linux/s390/htm.h b/sysdeps/unix/sysv/linux/s390/htm.h
index 32d5a88..70d7f66 100644
--- a/sysdeps/unix/sysv/linux/s390/htm.h
+++ b/sysdeps/unix/sysv/linux/s390/htm.h
@@ -119,11 +119,16 @@
 			      ar modification and fp operations.  Some	\
 			      program-interruptions (e.g. a null	\
 			      pointer access) are filtered and the	\
-			      trancsaction will abort.  In this case	\
+			      transaction will abort.  In this case	\
 			      the normal lock path will execute it	\
 			      again and result in a core dump wich does	\
 			      now show at tbegin but the real executed	\
-			      instruction.  */				\
+			      instruction.				\
+			      However it is not guaranteed that this	\
+			      retry operate on the same data and thus	\
+			      may not end in an program-interruption.	\
+			      Note: This could also be used to probe	\
+			      memory for being accessible!  */		\
 			   "2: tbegin 0, 0xFF0E\n\t"			\
 			   /* Branch away in abort case (this is the	\
 			      prefered sequence.  See PoP in chapter 5	\
@@ -152,7 +157,8 @@
      __ret;								\
      })
 
-/* These builtins are correct.  Use them.  */
+/* These builtins are usable in context of glibc lock elision code without any
+   changes.  Use them.  */
 #define __libc_tend()							\
   ({ __asm__ __volatile__ (".machine push\n\t"				\
 			   ".machinemode \"zarch_nohighgprs\"\n\t"	\

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 4/4] S390: Optimize lock-elision by decrementing adapt_count at unlock.
  2017-01-19 11:14             ` Stefan Liebler
@ 2017-01-20  8:54               ` Stefan Liebler
  0 siblings, 0 replies; 35+ messages in thread
From: Stefan Liebler @ 2017-01-20  8:54 UTC (permalink / raw)
  To: libc-alpha

On 01/19/2017 12:14 PM, Stefan Liebler wrote:
> On 01/18/2017 05:25 PM, Torvald Riegel wrote:
>> On Wed, 2017-01-18 at 13:28 +0100, Stefan Liebler wrote:
>>> On 01/17/2017 07:52 PM, Torvald Riegel wrote:
>>>> On Tue, 2017-01-17 at 16:28 +0100, Stefan Liebler wrote:
>>>> The rest of the diff looked OK.
>>>>
>>> If it's okay, I will make one patch with all the reviewed diffs and post
>>> it with a ChangeLog in this mail-thread.
>>> Then I would commit it before release to have a consistent state?
>>
>> Yes, please commit this.
>>
>>
>>
> Okay. I'll commit this patch.
> Thanks.
>
> S390: Adjust lock elision code after review.
>
> This patch adjusts s390 specific lock elision code after review
> of the following patches:
> -S390: Use own tbegin macro instead of __builtin_tbegin.
> (8bfc4a2ab4bebdf86c151665aae8a266e2f18fb4)
> -S390: Use new __libc_tbegin_retry macro in elision-lock.c.
> (53c5c3d5ac238901c13f28a73ba05b0678094e80)
> -S390: Optimize lock-elision by decrementing adapt_count at unlock.
> (dd037fb3df286b7c2d0b0c6f8d02a2dd8a8e8a08)
>
> The futex value is not tested before starting a transaction,
> __glibc_likely is used instead of __builtin_expect and comments
> are adjusted.
>
> ChangeLog:
>
>     * sysdeps/unix/sysv/linux/s390/htm.h: Adjust comments.
>     * sysdeps/unix/sysv/linux/s390/elision-unlock.c: Likewise.
>     * sysdeps/unix/sysv/linux/s390/elision-lock.c: Adjust comments.
>     (__lll_lock_elision): Do not test futex before starting a
>     transaction.  Use __glibc_likely instead of __builtin_expect.
>     * sysdeps/unix/sysv/linux/s390/elision-trylock.c: Adjust comments.
>     (__lll_trylock_elision): Do not test futex before starting a
>     transaction.  Use __glibc_likely instead of __builtin_expect.
Commited.

^ permalink raw reply	[flat|nested] 35+ messages in thread

end of thread, other threads:[~2017-01-20  8:54 UTC | newest]

Thread overview: 35+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-12-06 13:52 [PATCH 1/4] S390: Use C11-like atomics instead of plain memory accesses in lock elision code Stefan Liebler
2016-12-06 13:52 ` [PATCH 3/4] S390: Use new __libc_tbegin_retry macro in elision-lock.c Stefan Liebler
2017-01-10 16:53   ` Torvald Riegel
2017-01-17 15:28     ` Stefan Liebler
2017-01-17 17:45       ` Torvald Riegel
2016-12-06 13:52 ` [PATCH 2/4] S390: Use own tbegin macro instead of __builtin_tbegin Stefan Liebler
2016-12-22 12:54   ` Florian Weimer
2017-01-10 12:32     ` Stefan Liebler
2017-01-10 16:34   ` Torvald Riegel
2017-01-12 15:45     ` Florian Weimer
2017-01-13 11:28       ` Torvald Riegel
2017-01-13 14:50         ` Florian Weimer
2017-01-13 15:32           ` Torvald Riegel
2017-01-13 15:34             ` Florian Weimer
2017-01-13 16:22             ` Stefan Liebler
2017-01-13 22:22               ` Torvald Riegel
2017-01-17 15:28       ` Stefan Liebler
2017-01-17 15:28     ` Stefan Liebler
2017-01-17 17:44       ` Torvald Riegel
2016-12-06 13:52 ` [PATCH 4/4] S390: Optimize lock-elision by decrementing adapt_count at unlock Stefan Liebler
2017-01-11 10:53   ` Torvald Riegel
2017-01-17 15:28     ` Stefan Liebler
2017-01-17 18:53       ` Torvald Riegel
2017-01-18 12:28         ` Stefan Liebler
2017-01-18 16:25           ` Torvald Riegel
2017-01-19 11:14             ` Stefan Liebler
2017-01-20  8:54               ` Stefan Liebler
2016-12-13  8:38 ` [PING][PATCH 1/4] S390: Use C11-like atomics instead of plain memory accesses in lock elision code Stefan Liebler
2016-12-16 17:12   ` Stefan Liebler
2016-12-16 18:14   ` Torvald Riegel
2016-12-20 14:28     ` Stefan Liebler
2016-12-22  9:34       ` Torvald Riegel
2017-01-10 11:34         ` Torvald Riegel
2017-01-11 16:06         ` Stefan Liebler
2017-01-12  9:43           ` Torvald Riegel

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).