public inbox for gcc-patches@gcc.gnu.org
* [PATCH] add self-tuning to x86 hardware fast path in libitm
@ 2015-04-07 17:36 Nuno Diegues
  2015-04-08 14:05 ` Andi Kleen
  0 siblings, 1 reply; 20+ messages in thread
From: Nuno Diegues @ 2015-04-07 17:36 UTC (permalink / raw)
  To: gcc-patches

Hi,

The libitm library contains a fast path for x86 to use Intel
Restricted Transactional Memory (RTM) when available. This Hardware
Transactional Memory (HTM) requires a software-based fallback to
execute the atomic blocks when the hardware fails, for instance
because the transaction is too large.

In fact, the performance of this HTM (and of others that are similarly
best-effort) depends significantly on the interplay between the
hardware and the software fallback. Extensive experimentation has shown
that no single fallback configuration performs best across all
applications and workloads [2].

Hence, in the scope of my work I have developed a self-tuning approach
that exploits lightweight reinforcement learning techniques to
identify the optimal RTM configuration in a workload-oblivious manner,
i.e., without requiring any off-line sampling of the application.

The implementation in this patch follows closely the following work:
[1] "Self-Tuning Intel Transactional Synchronization Extensions", Nuno
Diegues and Paolo Romano, Proceedings of the International Conference
on Autonomic Computing, ICAC 2014
http://homepages.gsd.inesc-id.pt/~ndiegues/files/icac14-ndiegues.pdf

[2] "Virtues and Limitations of Commodity Hardware Transactional
Memory", Nuno Diegues, Paolo Romano, and Luis Rodrigues, Proceedings
of the International Conference on Parallel Architectures and Compiler
Techniques, PACT 2014
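
To give a concrete picture of the idea before diving into the diff: the
library keeps, for each way of reacting to a capacity abort, how many
transactions committed and how many cycles elapsed while that mode was
active, and periodically picks the next mode with an Upper Confidence
Bounds rule. A minimal, self-contained sketch of that selection step
(illustrative only; the names are made up and differ from the patch):

#include <cmath>
#include <cstdint>

enum abort_mode { STUBBORN, HALVEN, GIVEUP, NUM_MODES };

struct mode_stats
{
  uint64_t txns;    // transactions committed while this mode was active
  uint64_t cycles;  // cycles spent while this mode was active (both kept > 0)
};

// Pick the mode with the highest UCB score: observed throughput,
// normalized by the best throughput seen so far (assumed > 0), plus an
// exploration bonus that grows for rarely tried modes.
static abort_mode
pick_mode (const mode_stats s[NUM_MODES], double best_throughput)
{
  double total_txns = 0.0;
  for (int m = 0; m < NUM_MODES; m++)
    total_txns += s[m].txns;

  abort_mode best = STUBBORN;
  double best_score = -1.0;
  for (int m = 0; m < NUM_MODES; m++)
    {
      double reward = (s[m].txns / (double) s[m].cycles) / best_throughput;
      double bonus = std::sqrt (2.0 * std::log (total_txns) / s[m].txns);
      if (reward + bonus > best_score)
        {
          best_score = reward + bonus;
          best = (abort_mode) m;
        }
    }
  return best;
}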


The copyright assignment for this is still in progress. Also, please
bear in mind that this is my first (ever) attempt to contribute to
GCC. As such, any suggestion, however simple it may seem, will be most
welcome.


Output of: "svn diff --extensions -u"

Index: libitm/method-serial.cc
===================================================================
--- libitm/method-serial.cc (revision 219316)
+++ libitm/method-serial.cc (working copy)
@@ -223,7 +223,23 @@ struct htm_mg : public method_group
     // initially disabled.
 #ifdef USE_HTM_FASTPATH
     htm_fastpath = htm_init();
+#ifdef HAVE_AS_RTM
+    optimizer.optimized_mode = STUBBORN;
+    optimizer.optimized_attempts = htm_fastpath;
+    optimizer.last_cycles = rdtsc();
+    optimizer.last_total_txs_executed = 0;
+    optimizer.last_throughput = 0.0;
+    optimizer.last_attempts = htm_fastpath > 0 ? htm_fastpath - 1 : 1;
+    optimizer.best_ever_throughput = 0.0;
+    optimizer.best_ever_attempts = htm_fastpath;
+    optimizer.txns_while_stubborn = 1;
+    optimizer.cycles_while_stubborn = 100;
+    optimizer.txns_while_halven = 1;
+    optimizer.cycles_while_halven = 100;
+    optimizer.txns_while_giveup = 1;
+    optimizer.cycles_while_giveup = 100;
 #endif
+#endif
   }
   virtual void fini()
   {
Index: libitm/config/x86/sjlj.S
===================================================================
--- libitm/config/x86/sjlj.S (revision 219316)
+++ libitm/config/x86/sjlj.S (working copy)
@@ -59,12 +59,14 @@
 #define pr_hasNoAbort 0x08
 #define pr_HTMRetryableAbort 0x800000
 #define pr_HTMRetriedAfterAbort 0x1000000
+#define pr_HTMCapacityAbort     0x2000000
 #define a_runInstrumentedCode 0x01
 #define a_runUninstrumentedCode 0x02
 #define a_tryHTMFastPath 0x20

 #define _XABORT_EXPLICIT (1 << 0)
 #define _XABORT_RETRY (1 << 1)
+#define _XABORT_CAPACITY (1 << 3)

  .text

@@ -108,9 +110,12 @@ SYM(_ITM_beginTransaction):
 .Ltxn_abort:
  /* If it might make sense to retry the HTM fast path, let the C++
     code decide.  */
- testl $(_XABORT_RETRY|_XABORT_EXPLICIT), %eax
+ testl $(_XABORT_RETRY|_XABORT_EXPLICIT|_XABORT_CAPACITY), %eax
  jz .Lno_htm
  orl $pr_HTMRetryableAbort, %edi
+ testl   $(_XABORT_CAPACITY), %eax
+ jz      .Lno_htm
+ orl     $pr_HTMCapacityAbort, %edi
  /* Let the C++ code handle the retry policy.  */
 .Lno_htm:
 #endif
Index: libitm/config/x86/target.h
===================================================================
--- libitm/config/x86/target.h (revision 219316)
+++ libitm/config/x86/target.h (working copy)
@@ -126,12 +126,25 @@ htm_abort_should_retry (uint32_t begin_ret)
   return begin_ret & _XABORT_RETRY;
 }

+static inline bool
+htm_is_capacity_abort(uint32_t begin_ret)
+{
+  return begin_ret & _XABORT_CAPACITY;
+}
+
 /* Returns true iff a hardware transaction is currently being executed.  */
 static inline bool
 htm_transaction_active ()
 {
   return _xtest() != 0;
 }
+
+static inline uint64_t rdtsc()
+{
+    uint32_t hi, lo;
+    __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
+    return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 );
+}
 #endif


Index: libitm/libitm_i.h
===================================================================
--- libitm/libitm_i.h (revision 219316)
+++ libitm/libitm_i.h (working copy)
@@ -242,6 +242,9 @@ struct gtm_thread
   uint32_t restart_reason[NUM_RESTARTS];
   uint32_t restart_total;

+  // Keeps track of how many transactions were successfully executed.
+  uint64_t number_executed_txns;
+
   // *** The shared part of gtm_thread starts here. ***
   // Shared state is on separate cachelines to avoid false sharing with
   // thread-local parts of gtm_thread.
@@ -286,6 +289,8 @@ struct gtm_thread
   static void *operator new(size_t);
   static void operator delete(void *);

+  static void reoptimize_htm_execution();
+
   // Invoked from assembly language, thus the "asm" specifier on
   // the name, avoiding complex name mangling.
   static uint32_t begin_transaction(uint32_t, const gtm_jmpbuf *)
@@ -309,6 +314,40 @@ struct gtm_thread
   void commit_user_actions ();
 };

+// Different ways to deal with capacity aborts in HTM execution.
+enum gtm_capacity_abort_mode
+{
+  STUBBORN,
+  HALVEN,
+  GIVEUP
+};
+
+// Maintains the current values optimized for HTM execution and the
+// corresponding statistics gathered for the decision-making.
+struct gtm_global_optimizer
+{
+  // Mode chosen to currently deal with capacity aborts.
+  gtm_capacity_abort_mode optimized_mode;
+  // Number of attempts chosen to currently insist on HTM execution.
+  uint32_t optimized_attempts;
+
+  uint64_t last_cycles;
+  uint64_t last_total_txs_executed;
+
+  double last_throughput;
+  uint32_t last_attempts;
+
+  double best_ever_throughput;
+  uint32_t best_ever_attempts;
+
+  uint64_t txns_while_stubborn;
+  uint64_t cycles_while_stubborn;
+  uint64_t txns_while_halven;
+  uint64_t cycles_while_halven;
+  uint64_t txns_while_giveup;
+  uint64_t cycles_while_giveup;
+};
+
 } // namespace GTM

 #include "tls.h"
@@ -346,6 +385,9 @@ extern gtm_cacheline_mask gtm_mask_stack(gtm_cache
 // the name, avoiding complex name mangling.
 extern uint32_t htm_fastpath __asm__(UPFX "gtm_htm_fastpath");

+// Maintains the optimization for HTM execution.
+extern gtm_global_optimizer optimizer;
+
 } // namespace GTM

 #endif // LIBITM_I_H
Index: libitm/beginend.cc
===================================================================
--- libitm/beginend.cc (revision 219316)
+++ libitm/beginend.cc (working copy)
@@ -24,8 +24,9 @@

 #include "libitm_i.h"
 #include <pthread.h>
+#include <math.h>
+//#include <cstdio>

-
 using namespace GTM;

 #if !defined(HAVE_ARCH_GTM_THREAD) || !defined(HAVE_ARCH_GTM_THREAD_DISP)
@@ -39,6 +40,9 @@ unsigned GTM::gtm_thread::number_of_threads = 0;
 gtm_stmlock GTM::gtm_stmlock_array[LOCK_ARRAY_SIZE];
 atomic<gtm_version> GTM::gtm_clock;

+// Optimization of HTM executions.
+gtm_global_optimizer GTM::optimizer;
+
 /* ??? Move elsewhere when we figure out library initialization.  */
 uint64_t GTM::gtm_spin_count_var = 1000;

@@ -149,6 +153,147 @@ choose_code_path(uint32_t prop, abi_dispatch *disp
     return a_runInstrumentedCode;
 }

+static inline float fastLog(float x)
+{
+  union { float f; uint32_t i; } vx = { x };
+  float y = vx.i;
+  y *= 8.2629582881927490e-8f;
+  return y - 87.989971088f;
+}
+
+static inline float fastSqrt(float x)
+{
+  union
+  {
+    int i;
+    float x;
+  } u;
+
+  u.x = x;
+  u.i = (1<<29) + (u.i >> 1) - (1<<22);
+  return u.x;
+}
+
+void
+GTM::gtm_thread::reoptimize_htm_execution()
+{
+  gtm_thread *tx = gtm_thr();
+  uint64_t total_txs_executed = 0;
+
+  // Collect the statistics obtained so far.
+  serial_lock.read_lock(tx);
+  gtm_thread *ptr = list_of_threads;
+  for (; ptr; ptr = ptr->next_thread)
+    {
+      total_txs_executed += ptr->number_executed_txns;
+    }
+  serial_lock.read_unlock(tx);
+
+  // Obtain the delta performance with respect to the last period.
+  uint64_t current_cycles = rdtsc();
+  uint64_t cycles_used = current_cycles - optimizer.last_cycles;
+  uint64_t new_txs_executed =
+    total_txs_executed - optimizer.last_total_txs_executed;
+  optimizer.last_cycles = current_cycles;
+  optimizer.last_total_txs_executed = total_txs_executed;
+  if (optimizer.optimized_mode == STUBBORN)
+    {
+      optimizer.txns_while_stubborn += new_txs_executed;
+      optimizer.cycles_while_stubborn += cycles_used;
+    }
+  else if (optimizer.optimized_mode == HALVEN)
+    {
+      optimizer.txns_while_halven += new_txs_executed;
+      optimizer.cycles_while_halven += cycles_used;
+    }
+  else
+    {
+      optimizer.txns_while_giveup += new_txs_executed;
+      optimizer.cycles_while_giveup += cycles_used;
+    }
+
+  // Compute Upper Confidence Bounds for the mode to choose next.
+  float log_sum = 2 * fastLog(static_cast<float>(
+    optimizer.txns_while_stubborn + optimizer.txns_while_halven +
+    optimizer.txns_while_giveup));
+  float ucb_stubborn = ((static_cast<float>(optimizer.txns_while_stubborn) /
+    optimizer.cycles_while_stubborn) / optimizer.best_ever_throughput) +
+    fastSqrt(log_sum / static_cast<float>(optimizer.txns_while_stubborn));
+  float ucb_halven = ((static_cast<float>(optimizer.txns_while_halven) /
+    optimizer.cycles_while_halven) / optimizer.best_ever_throughput) +
+    fastSqrt(log_sum / static_cast<float>(optimizer.txns_while_halven));
+  float ucb_giveup = ((static_cast<float>(optimizer.txns_while_giveup) /
+    optimizer.cycles_while_giveup) / optimizer.best_ever_throughput) +
+    fastSqrt(log_sum / static_cast<float>(optimizer.txns_while_giveup));
+  if (ucb_stubborn > ucb_halven && ucb_stubborn > ucb_giveup)
+    {
+      optimizer.optimized_mode = STUBBORN;
+    }
+  else if (ucb_halven > ucb_giveup)
+    {
+      optimizer.optimized_mode = HALVEN;
+    }
+  else
+    {
+      optimizer.optimized_mode = GIVEUP;
+    }
+
+  // Helps to obtain more meaningful numbers. Not necessary for correctness.
+  const double avg_cycles_per_tx = 5000.0;
+  double current_throughput =
+    (new_txs_executed * avg_cycles_per_tx) / cycles_used;
+
+  // Compute gradient descent for the number of retries.
+  double change_for_better = current_throughput / optimizer.last_throughput;
+  double change_for_worse = optimizer.last_throughput / current_throughput;
+  int32_t last_attempts = optimizer.last_attempts;
+  int32_t current_attempts = optimizer.optimized_attempts;
+  int32_t new_attempts = current_attempts;
+  if (unlikely(change_for_worse > 1.40))
+    {
+      optimizer.optimized_attempts = optimizer.best_ever_attempts;
+      optimizer.last_throughput = current_throughput;
+      optimizer.last_attempts = current_attempts;
+      return;
+    }
+
+  if (unlikely(random() % 100 < 1))
+    {
+      optimizer.last_attempts = optimizer.optimized_attempts;
+      optimizer.last_throughput = current_throughput;
+      optimizer.optimized_attempts = random() % 18 + 2;
+      return;
+    }
+
+      if (change_for_better > 1.05)
+        {
+          new_attempts += current_attempts - last_attempts;
+          if (current_attempts == last_attempts)
+            new_attempts += random() % 2 == 0 ? -1 : 1;
+        }
+      else if (change_for_worse > 1.05)
+        {
+          new_attempts -= current_attempts - last_attempts;
+          if (current_attempts == last_attempts)
+            new_attempts += random() % 2 == 0 ? -1 : 1;
+        }
+      if (unlikely(new_attempts > 20))
+        new_attempts = 20;
+      else if (unlikely(new_attempts < 2))
+        new_attempts = 2;
+      if (current_attempts != new_attempts)
+        {
+          optimizer.last_attempts = current_attempts;
+          optimizer.last_throughput = current_throughput;
+        }
+      optimizer.optimized_attempts = new_attempts;
+      if (unlikely(current_throughput > optimizer.best_ever_throughput))
+        {
+          optimizer.best_ever_throughput = current_throughput;
+          optimizer.best_ever_attempts = current_attempts;
+        }
+}
+
 uint32_t
 GTM::gtm_thread::begin_transaction (uint32_t prop, const gtm_jmpbuf *jb)
 {
@@ -190,7 +335,7 @@ GTM::gtm_thread::begin_transaction (uint32_t prop,
 #ifndef HTM_CUSTOM_FASTPATH
   if (likely(htm_fastpath && (prop & pr_hasNoAbort)))
     {
-      for (uint32_t t = htm_fastpath; t; t--)
+      for (uint32_t t = optimizer.optimized_attempts; t; t--)
  {
    uint32_t ret = htm_begin();
    if (htm_begin_success(ret))
@@ -209,19 +354,36 @@ GTM::gtm_thread::begin_transaction (uint32_t prop,
      }
    // The transaction has aborted.  Don't retry if it's unlikely that
    // retrying the transaction will be successful.
-   if (!htm_abort_should_retry(ret))
+#ifdef HAVE_AS_RTM
+          if (htm_is_capacity_abort(ret))
+            {
+              gtm_capacity_abort_mode capacity_mode = optimizer.optimized_mode;
+              if (capacity_mode == HALVEN)
+                t = t / 2 + 1;
+              else if (capacity_mode == GIVEUP)
+                t = 1;
+            }
+          // Only one thread performs this to avoid the need for
+          // synchronization. We use the oldest thread that is active.
+          // We also reoptimize at most once per transaction.
+          if (unlikely(tx->next_thread == NULL &&
+              tx->number_executed_txns % 500 == 0 && t == 1))
+              reoptimize_htm_execution();
+#else
+          if (!htm_abort_should_retry(ret))
      break;
+#endif
    // Wait until any concurrent serial-mode transactions have finished.
    // This is an empty critical section, but won't be elided.
    if (serial_lock.is_write_locked())
      {
-       tx = gtm_thr();
-       if (unlikely(tx == NULL))
-         {
-           // See below.
-           tx = new gtm_thread();
-           set_gtm_thr(tx);
-         }
+              tx = gtm_thr();
+              if (unlikely(tx == NULL))
+                {
+                  // See below.
+                  tx = new gtm_thread();
+                  set_gtm_thr(tx);
+                }
        // Check whether there is an enclosing serial-mode transaction;
        // if so, we just continue as a nested transaction and don't
        // try to use the HTM fastpath.  This case can happen when an
@@ -262,12 +424,26 @@ GTM::gtm_thread::begin_transaction (uint32_t prop,
       // other fallback will use serial transactions, which don't use
       // restart_total but will reset it when committing.
       if (!(prop & pr_HTMRetriedAfterAbort))
- tx->restart_total = htm_fastpath;
+        tx->restart_total = optimizer.optimized_attempts;

       if (--tx->restart_total > 0)
  {
    // Wait until any concurrent serial-mode transactions have finished.
    // Essentially the same code as above.
+#ifdef HAVE_AS_RTM
+          if (prop & pr_HTMCapacityAbort)
+            {
+              gtm_capacity_abort_mode capacity_mode = optimizer.optimized_mode;
+              if (capacity_mode == HALVEN)
+                tx->restart_total = tx->restart_total / 2 + 1;
+              else if (capacity_mode == GIVEUP)
+                goto stop_custom_htm_fastpath;
+            }
+
+          if (unlikely(tx->next_thread == NULL &&
+              tx->number_executed_txns % 500 == 0 && tx->restart_total == 1))
+              reoptimize_htm_execution();
+#endif
    if (serial_lock.is_write_locked())
      {
        if (tx->nesting > 0)
@@ -665,12 +841,21 @@ _ITM_commitTransaction(void)
   if (likely(htm_fastpath && !gtm_thread::serial_lock.is_write_locked()))
     {
       htm_commit();
+      gtm_thread *tx = gtm_thr();
+      if (unlikely(tx == NULL))
+        {
+          tx = new gtm_thread();
+          set_gtm_thr(tx);
+        }
+      tx->number_executed_txns++;
       return;
     }
 #endif
   gtm_thread *tx = gtm_thr();
   if (!tx->trycommit ())
     tx->restart (RESTART_VALIDATE_COMMIT);
+
+  tx->number_executed_txns++;
 }

 void ITM_REGPARM
@@ -681,6 +866,13 @@ _ITM_commitTransactionEH(void *exc_ptr)
   if (likely(htm_fastpath && !gtm_thread::serial_lock.is_write_locked()))
     {
       htm_commit();
+      gtm_thread *tx = gtm_thr();
+      if (unlikely(tx == NULL))
+        {
+          tx = new gtm_thread();
+          set_gtm_thr(tx);
+        }
+      tx->number_executed_txns++;
       return;
     }
 #endif
@@ -690,4 +882,5 @@ _ITM_commitTransactionEH(void *exc_ptr)
       tx->eh_in_flight = exc_ptr;
       tx->restart (RESTART_VALIDATE_COMMIT);
     }
+  tx->number_executed_txns++;
 }
Index: libitm/libitm.h
===================================================================
--- libitm/libitm.h (revision 219316)
+++ libitm/libitm.h (working copy)
@@ -101,7 +101,8 @@ typedef enum
    /* These are not part of the ABI but used for custom HTM fast paths.  See
       ITM_beginTransaction and gtm_thread::begin_transaction.  */
    pr_HTMRetryableAbort = 0x800000,
-   pr_HTMRetriedAfterAbort = 0x1000000
+   pr_HTMRetriedAfterAbort = 0x1000000,
+   pr_HTMCapacityAbort          = 0x2000000
 } _ITM_codeProperties;

 /* Result from startTransaction that describes what actions to take.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] add self-tuning to x86 hardware fast path in libitm
  2015-04-07 17:36 [PATCH] add self-tuning to x86 hardware fast path in libitm Nuno Diegues
@ 2015-04-08 14:05 ` Andi Kleen
  2015-04-08 15:29   ` Nuno Diegues
  0 siblings, 1 reply; 20+ messages in thread
From: Andi Kleen @ 2015-04-08 14:05 UTC (permalink / raw)
  To: Nuno Diegues; +Cc: gcc-patches

Nuno Diegues <nmld@ist.utl.pt> writes:

What workloads did you test this on?

> +static inline float fastLog(float x)
> +{
> +  union { float f; uint32_t i; } vx = { x };
> +  float y = vx.i;
> +  y *= 8.2629582881927490e-8f;
> +  return y - 87.989971088f;
> +}
> +
> +static inline float fastSqrt(float x)
> +{
> +  union
> +  {
> +    int i;
> +    float x;
> +  } u;
> +
> +  u.x = x;
> +  u.i = (1<<29) + (u.i >> 1) - (1<<22);
> +  return u.x;
> +}

Are you sure you need floating point here? If the program does not
use it in any other way, faulting in the floating-point state can be
quite expensive.

I bet fixed point would work for such simple purposes too.
> +  serial_lock.read_unlock(tx);
> +
> +  // Obtain the delta performance with respect to the last period.
> +  uint64_t current_cycles = rdtsc();
> +  uint64_t cycles_used = current_cycles - optimizer.last_cycles;

It may be worth pointing out that rdtsc does not return cycles.
In fact, the ratio to real cycles varies as the frequency changes.
I hope your algorithms can handle that.
> +
> +  // Compute gradient descent for the number of retries.
> +  double change_for_better = current_throughput / optimizer.last_throughput;
> +  double change_for_worse = optimizer.last_throughput / current_throughput;
> +  int32_t last_attempts = optimizer.last_attempts;
> +  int32_t current_attempts = optimizer.optimized_attempts;
> +  int32_t new_attempts = current_attempts;
> +  if (unlikely(change_for_worse > 1.40))
> +    {
> +      optimizer.optimized_attempts = optimizer.best_ever_attempts;
> +      optimizer.last_throughput = current_throughput;
> +      optimizer.last_attempts = current_attempts;
> +      return;
> +    }
> +
> +  if (unlikely(random() % 100 < 1))
> +    {

So where is the seed for that random stored? Could you corrupt some
user's random state? Is the state per thread or global?
If it's per thread, how do you initialize it so that the threads
start with different seeds?
If it's global, what synchronizes it?

Overall the algorithm looks very complicated, with many mysterious magic
numbers. Are there simplifications possible? While the retry path is not
extremely critical, it should be at least somewhat optimized; otherwise
it will dominate the cost of short transactions.

One problem with so many magic numbers is that they may be good on one
system, but bad on another.

-Andi
-- 
ak@linux.intel.com -- Speaking for myself only

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] add self-tuning to x86 hardware fast path in libitm
  2015-04-08 14:05 ` Andi Kleen
@ 2015-04-08 15:29   ` Nuno Diegues
  2015-04-08 17:55     ` Andi Kleen
  0 siblings, 1 reply; 20+ messages in thread
From: Nuno Diegues @ 2015-04-08 15:29 UTC (permalink / raw)
  To: Andi Kleen; +Cc: gcc-patches

Thank you for the feedback. Comments inline.


On Wed, Apr 8, 2015 at 3:05 PM, Andi Kleen <andi@firstfloor.org> wrote:
>
> Nuno Diegues <nmld@ist.utl.pt> writes:
>
> What workloads did you test this on?


On the STAMP suite of benchmarks for transactional memory (described in [1]).
I ran an unmodified GCC 5.0.0 against the patched GCC with these
modifications and obtained the following speedups in STAMP with 4
threads (on a Haswell with 4 cores, average of 10 runs):

benchmarks: speedup
genome: 1.32
intruder: 1.66
labyrinth: 1.00
ssca2: 1.02
yada: 1.00
kmeans-high: 1.13
kmeans-low: 1.10
vacation-high: 2.27
vacation-low: 1.88


[1] Chi Cao Minh, JaeWoong Chung, Christos Kozyrakis, Kunle Olukotun:
STAMP: Stanford Transactional Applications for Multi-Processing. IISWC
2008: 35-46



>
> Are you sure you need floating point here? If the program does not
> use it in any other ways faulting in the floating point state can be
> quite expensive.
>
> I bet fixed point would work for such simple purposes too.

That is a good point. While I haven't ever used fixed point
arithmetic, a cursory inspection reveals that it does make sense and
seems applicable to this case.
Are you aware of some place where this is being done already within
GCC that I could use as inspiration, or should I craft some macros
from scratch for this?




> > +  serial_lock.read_unlock(tx);
> > +
> > +  // Obtain the delta performance with respect to the last period.
> > +  uint64_t current_cycles = rdtsc();
> > +  uint64_t cycles_used = current_cycles - optimizer.last_cycles;
>
> It may be worth pointing out that rdtsc does not return cycles.
> In fact the ratio to real cycles is variable depending on the changing frequency.
> I hope your algorithms can handle that.

The intent here is to obtain some low-cost notion of elapsed time.
RDTSC seemed to be the best choice available: it is not critical that
changes in processor frequency alter the relation between the returned
value and actual CPU cycles.


> > +
> > +  // Compute gradient descent for the number of retries.
> > +  double change_for_better = current_throughput / optimizer.last_throughput;
> > +  double change_for_worse = optimizer.last_throughput / current_throughput;
> > +  int32_t last_attempts = optimizer.last_attempts;
> > +  int32_t current_attempts = optimizer.optimized_attempts;
> > +  int32_t new_attempts = current_attempts;
> > +  if (unlikely(change_for_worse > 1.40))
> > +    {
> > +      optimizer.optimized_attempts = optimizer.best_ever_attempts;
> > +      optimizer.last_throughput = current_throughput;
> > +      optimizer.last_attempts = current_attempts;
> > +      return;
> > +    }
> > +
> > +  if (unlikely(random() % 100 < 1))
> > +    {
>
> So where is the seed for that random stored? Could you corrupt some
> user's random state? Is the state per thread or global?
> If it's per thread how do you initialize so that they threads do
> start with different seeds.
> If it's global what synchronizes it?

As I do not specify any seed, I was under the impression that there
would be a default initialization. Furthermore, the posix
documentation specifies random() to be MT-safe, so I assumed its
internal state to be per-thread.
Did I mis-interpret this?

With regard to the self-tuning state, it is kept within the
"gtm_global_optimizer optimizer" struct, which is in essence
multi-reader (any thread running transactions can check the struct to
use the parameters optimized in it) and single-writer (notice that the
"void GTM::gtm_thread::reoptimize_htm_execution()" function is called
only by one thread, the one at the end of the list of threads, i.e.,
whose tx->next_thread == NULL).
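
In code terms, the trigger boils down to the following (simplified from
the patch; the actual code additionally waits until the hardware
attempts for the current transaction are exhausted):

// Only the thread at the tail of the thread list ever re-optimizes,
// so the optimizer struct has a single writer and needs no locking.
if (tx->next_thread == NULL
    && tx->number_executed_txns % 500 == 0)
  reoptimize_htm_execution ();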


>
> Overall the algorithm looks very complicated with many mysterious magic
> numbers. Are there simplifications possible? While the retry path is not
> extremely critical it should be at least somewhat optimized, otherwise
> it will dominate the cost of short transactions.
>
> One problems with so many magic numbers is that they may be good on one
> system, but bad on another.

Notice that the retry path is barely changed in the common case: only
a designated thread (the last one in the list of threads registered in
libitm) will periodically execute the re-optimization. Hence, most of
the patch you see here is code executed in that (uncommon) case.
I understand the concern with magic numbers: we could self-tune them
as well, but that would surely increase the complexity of the patch :)
In essence, we have the following numbers at the moment (gathered as
named constants in the sketch after this list):
 * how often we re-optimize (every 500 successful transactions for the
designated thread)
 * how many maximum attempts we can have in hardware (20)
 * how much better and worse the performance must change for the
gradient descent to move to a new configuration (5%)
 * how terrible the performance must change for the gradient descent
to rollback to the best known configuration so far (40%)
 * how often the gradient descent can explore randomly to avoid local
optima (1%)
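
For readability, these could be gathered in one place as named
constants (hypothetical names, only to make the values above explicit):

// Hypothetical names for the tuning parameters listed above.
static const unsigned reopt_period_txns  = 500;   // re-optimize every N commits
static const unsigned max_htm_attempts   = 20;    // upper bound on HTM retries
static const unsigned min_htm_attempts   = 2;     // lower bound used in the patch code
static const double   step_threshold     = 1.05;  // 5% change moves the gradient descent
static const double   rollback_threshold = 1.40;  // 40% drop reverts to the best known config
static const unsigned explore_one_in     = 100;   // 1% chance of random exploration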



Once again, thank you for your time on this matter. Looking forward to
your comments.

-- Nuno Diegues

>
> -Andi
> --
> ak@linux.intel.com -- Speaking for myself only

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] add self-tuning to x86 hardware fast path in libitm
  2015-04-08 15:29   ` Nuno Diegues
@ 2015-04-08 17:55     ` Andi Kleen
  2015-04-09 10:42       ` Nuno Diegues
  0 siblings, 1 reply; 20+ messages in thread
From: Andi Kleen @ 2015-04-08 17:55 UTC (permalink / raw)
  To: Nuno Diegues; +Cc: Andi Kleen, gcc-patches

> On the STAMP suite of benchmarks for transactional memory (described here [1]).
> I have ran an unmodified GCC 5.0.0 against the patched GCC with these
> modifications and obtain the following speedups in STAMP with 4
> threads (on a Haswell with 4 cores, average 10 runs):

I expect you'll need different tunings on larger systems.

> That is a good point. While I haven't ever used fixed point
> arithmetic, a cursory inspection reveals that it does make sense and
> seems applicable to this case.
> Are you aware of some place where this is being done already within
> GCC that I could use as inspiration, or should I craft some macros
> from scratch for this?

I believe the inliner uses fixed point. Own macros should be fine too.
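
Something along these lines would probably be enough (untested sketch
of a 32.32 layout with a 128-bit intermediate; names made up):

#include <stdint.h>

/* Values are stored scaled by 2^32.  */
typedef uint64_t    fixed_t;
typedef __uint128_t fixed_wide_t;   /* intermediate for mul/div */

#define FIXED_FRAC_BITS 32
#define FIXED_ONE ((fixed_t) 1 << FIXED_FRAC_BITS)

/* Convert an integer count to fixed point.  */
#define fixed_from_int(i) ((fixed_t) (i) << FIXED_FRAC_BITS)

/* Multiply and divide through a 128-bit intermediate to keep precision.  */
#define fixed_mul(a, b) \
  ((fixed_t) (((fixed_wide_t) (a) * (fixed_wide_t) (b)) >> FIXED_FRAC_BITS))
#define fixed_div(a, b) \
  ((fixed_t) ((((fixed_wide_t) (a)) << FIXED_FRAC_BITS) / (fixed_wide_t) (b)))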

> > > +  int32_t last_attempts = optimizer.last_attempts;
> > > +  int32_t current_attempts = optimizer.optimized_attempts;
> > > +  int32_t new_attempts = current_attempts;
> > > +  if (unlikely(change_for_worse > 1.40))
> > > +    {
> > > +      optimizer.optimized_attempts = optimizer.best_ever_attempts;
> > > +      optimizer.last_throughput = current_throughput;
> > > +      optimizer.last_attempts = current_attempts;
> > > +      return;
> > > +    }
> > > +
> > > +  if (unlikely(random() % 100 < 1))
> > > +    {
> >
> > So where is the seed for that random stored? Could you corrupt some
> > user's random state? Is the state per thread or global?
> > If it's per thread how do you initialize so that they threads do
> > start with different seeds.
> > If it's global what synchronizes it?
> 
> As I do not specify any seed, I was under the impression that there
> would be a default initialization. Furthermore, the posix
> documentation specifies random() to be MT-safe, so I assumed its
> internal state to be per-thread.
> Did I mis-interpret this?

Yes, that's right. But it's very nasty to change the user's RNG state.
A common pattern for repeatable benchmarks is to start with srand(1)
and then use the random numbers to run the benchmark, so it always does
the same thing. If you non-deterministically change the random state
(transaction aborts are not deterministic), the benchmark will not be
repeatable anymore.  You'll need to use your own, independent RNG state.
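
A tiny private generator is plenty for this purpose, e.g. (sketch; only
the single re-optimizing thread would call it, so no locking is needed):

#include <stdint.h>

/* Private PRNG state, independent of the user's random().  */
static uint32_t reopt_rng_state = 123456789;

static inline uint32_t
reopt_rng_next (uint32_t bound)
{
  /* Plain 32-bit linear congruential generator; wraps modulo 2^32.  */
  reopt_rng_state = reopt_rng_state * 1103515245u + 12345u;
  return reopt_rng_state % bound;
}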

It would be good to see if any parts of the algorithm can be
simplified. In general in production software the goal is to have
the simplest algorithm that does the job.

-Andi
-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] add self-tuning to x86 hardware fast path in libitm
  2015-04-08 17:55     ` Andi Kleen
@ 2015-04-09 10:42       ` Nuno Diegues
  2015-04-10 12:24         ` Andi Kleen
  0 siblings, 1 reply; 20+ messages in thread
From: Nuno Diegues @ 2015-04-09 10:42 UTC (permalink / raw)
  To: Andi Kleen; +Cc: gcc-patches

On Wed, Apr 8, 2015 at 6:54 PM, Andi Kleen <andi@firstfloor.org> wrote:
>> On the STAMP suite of benchmarks for transactional memory (described here [1]).
>> I have ran an unmodified GCC 5.0.0 against the patched GCC with these
>> modifications and obtain the following speedups in STAMP with 4
>> threads (on a Haswell with 4 cores, average 10 runs):
>
> I expect you'll need different tunings on larger systems.

I did not quite understand the extent of your comment: what
specifically would need different tuning? The idea is exactly that
this proposal is not tied to any particular workload or deployment;
there are some parameters (namely, the magic numbers we discussed), but
they are quite reasonable, i.e., each one has a sensible value whose
meaning we understand.


>
>> That is a good point. While I haven't ever used fixed point
>> arithmetic, a cursory inspection reveals that it does make sense and
>> seems applicable to this case.
>> Are you aware of some place where this is being done already within
>> GCC that I could use as inspiration, or should I craft some macros
>> from scratch for this?
>
> I believe the inliner uses fixed point. Own macros should be fine too.

Thanks, will try this out.


>
>> > > +  int32_t last_attempts = optimizer.last_attempts;
>> > > +  int32_t current_attempts = optimizer.optimized_attempts;
>> > > +  int32_t new_attempts = current_attempts;
>> > > +  if (unlikely(change_for_worse > 1.40))
>> > > +    {
>> > > +      optimizer.optimized_attempts = optimizer.best_ever_attempts;
>> > > +      optimizer.last_throughput = current_throughput;
>> > > +      optimizer.last_attempts = current_attempts;
>> > > +      return;
>> > > +    }
>> > > +
>> > > +  if (unlikely(random() % 100 < 1))
>> > > +    {
>> >
>> > So where is the seed for that random stored? Could you corrupt some
>> > user's random state? Is the state per thread or global?
>> > If it's per thread how do you initialize so that they threads do
>> > start with different seeds.
>> > If it's global what synchronizes it?
>>
>> As I do not specify any seed, I was under the impression that there
>> would be a default initialization. Furthermore, the posix
>> documentation specifies random() to be MT-safe, so I assumed its
>> internal state to be per-thread.
>> Did I mis-interpret this?
>
> Yes, that's right. But it's very nasty to change the users RNG state.
> A common pattern for repeatable benchmarks is to start with srand(1)
> and then use the random numbers to run the benchmark, so it always does
> the same thing. If you non deterministically (transaction aborts are not
> deterministic) change the random state it will make the benchmark not
> repeatable anymore.  You'll need to use an own RNG state that it independent.

I understand your concern, thanks for raising it.

One general question on how to proceed:
given that I make some further changes, should I post the whole patch again?


Best regards,
-- Nuno Diegues


>
> It would be good to see if any parts of the algorithm can be
> simplified. In general in production software the goal is to have
> the simplest algorithm that does the job.
>
> -Andi
> --
> ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] add self-tuning to x86 hardware fast path in libitm
  2015-04-09 10:42       ` Nuno Diegues
@ 2015-04-10 12:24         ` Andi Kleen
  2015-04-30  3:52           ` Nuno Diegues
  0 siblings, 1 reply; 20+ messages in thread
From: Andi Kleen @ 2015-04-10 12:24 UTC (permalink / raw)
  To: Nuno Diegues; +Cc: gcc-patches

Nuno Diegues <nmld@ist.utl.pt> writes:
>
> One general question on how to proceed:
> given that I make some further changes, should I post the whole patch again?

Yes please resend the patch.

-Andi

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] add self-tuning to x86 hardware fast path in libitm
  2015-04-10 12:24         ` Andi Kleen
@ 2015-04-30  3:52           ` Nuno Diegues
  2015-04-30  5:00             ` Andi Kleen
                               ` (2 more replies)
  0 siblings, 3 replies; 20+ messages in thread
From: Nuno Diegues @ 2015-04-30  3:52 UTC (permalink / raw)
  To: Andi Kleen; +Cc: gcc-patches, Paolo Romano

Hello,

I have taken the chance to improve the patch by addressing the
comments above in this thread.
Namely:
 - to use a simple random generator managed inside the library only
 - removed floating point usage and replaced by fixed arithmetic
 - added some comments where relevant

Re-running the STAMP benchmarks shows similar gains (to those shown
above) with respect to an unmodified GCC 5.0.0:

benchmark: speedup
genome: 1.58
intruder: 1.78
labyrinth: 1.0
ssca2: 1.01
yada: 1.0
kmeans-high: 1.16
kmeans-low: 1.16
vacation-high: 2.05
vacation-low: 2.81

I appreciate any feedback and comments!

Index: libitm/libitm_i.h
===================================================================
--- libitm/libitm_i.h (revision 219316)
+++ libitm/libitm_i.h (working copy)
@@ -242,6 +242,9 @@ struct gtm_thread
   uint32_t restart_reason[NUM_RESTARTS];
   uint32_t restart_total;

+  // Keeps track of how many transactions were successfully executed.
+  uint64_t number_executed_txns;
+
   // *** The shared part of gtm_thread starts here. ***
   // Shared state is on separate cachelines to avoid false sharing with
   // thread-local parts of gtm_thread.
@@ -286,6 +289,8 @@ struct gtm_thread
   static void *operator new(size_t);
   static void operator delete(void *);

+  static void reoptimize_htm_execution();
+
   // Invoked from assembly language, thus the "asm" specifier on
   // the name, avoiding complex name mangling.
   static uint32_t begin_transaction(uint32_t, const gtm_jmpbuf *)
@@ -309,6 +314,59 @@ struct gtm_thread
   void commit_user_actions ();
 };

+// Different ways to deal with capacity aborts in HTM execution.
+enum gtm_capacity_abort_mode
+{
+  STUBBORN,
+  HALVEN,
+  GIVEUP
+};
+
+// Definition of fixed point arithmetic types.
+// Half the bits are dedicated to the fractional type, and the rest to the
+// "whole" part.
+#define FIXED_PT_WIDTH  64
+#define FIXED_PT_INTEGER_WIDTH  32
+typedef uint64_t fixed_pt_t;
+typedef __uint128_t fixed_pt_td;
+
+#define FIXED_PT_FRAC_WIDTH     (FIXED_PT_WIDTH - FIXED_PT_INTEGER_WIDTH)
+#define FIXED_PT_ONE    ((fixed_pt_t)((fixed_pt_t)1 << FIXED_PT_FRAC_WIDTH))
+#define FIXED_PT_TWO    (FIXED_PT_ONE + FIXED_PT_ONE)
+
+#define fixed_mul(A,B) \
+  ((fixed_pt_t)(((fixed_pt_td)(A) * (fixed_pt_td)(B)) >> FIXED_PT_FRAC_WIDTH))
+#define fixed_div(A,B) \
+  ((fixed_pt_t)(((fixed_pt_td)(A) << FIXED_PT_FRAC_WIDTH) / (fixed_pt_td)(B)))
+#define fixed_const(R) \
+  ((fixed_pt_t)((R) * FIXED_PT_ONE + ((R) >= 0 ? 0.5 : -0.5)))
+
+// Maintains the current values optimized for HTM execution and the
+// corresponding statistics gathered for the decision-making.
+struct gtm_global_optimizer
+{
+  // Mode chosen to currently deal with capacity aborts.
+  gtm_capacity_abort_mode optimized_mode;
+  // Number of attempts chosen to currently insist on HTM execution.
+  uint32_t optimized_attempts;
+
+  uint64_t last_cycles;
+  uint64_t last_total_txs_executed;
+
+  fixed_pt_t last_throughput;
+  uint32_t last_attempts;
+
+  fixed_pt_t best_ever_throughput;
+  uint32_t best_ever_attempts;
+
+  uint64_t txns_while_stubborn;
+  uint64_t cycles_while_stubborn;
+  uint64_t txns_while_halven;
+  uint64_t cycles_while_halven;
+  uint64_t txns_while_giveup;
+  uint64_t cycles_while_giveup;
+};
+
 } // namespace GTM

 #include "tls.h"
@@ -346,6 +404,9 @@ extern gtm_cacheline_mask gtm_mask_stack(gtm_cache
 // the name, avoiding complex name mangling.
 extern uint32_t htm_fastpath __asm__(UPFX "gtm_htm_fastpath");

+// Maintains the optimization for HTM execution.
+extern gtm_global_optimizer optimizer;
+
 } // namespace GTM

 #endif // LIBITM_I_H
Index: libitm/libitm.h
===================================================================
--- libitm/libitm.h (revision 219316)
+++ libitm/libitm.h (working copy)
@@ -101,7 +101,8 @@ typedef enum
    /* These are not part of the ABI but used for custom HTM fast paths.  See
       ITM_beginTransaction and gtm_thread::begin_transaction.  */
    pr_HTMRetryableAbort = 0x800000,
-   pr_HTMRetriedAfterAbort = 0x1000000
+   pr_HTMRetriedAfterAbort = 0x1000000,
+   pr_HTMCapacityAbort          = 0x2000000
 } _ITM_codeProperties;

 /* Result from startTransaction that describes what actions to take.
Index: libitm/method-serial.cc
===================================================================
--- libitm/method-serial.cc (revision 219316)
+++ libitm/method-serial.cc (working copy)
@@ -223,7 +223,23 @@ struct htm_mg : public method_group
     // initially disabled.
 #ifdef USE_HTM_FASTPATH
     htm_fastpath = htm_init();
+#ifdef HAVE_AS_RTM
+    optimizer.optimized_mode = STUBBORN;
+    optimizer.optimized_attempts = htm_fastpath;
+    optimizer.last_cycles = rdtsc();
+    optimizer.last_total_txs_executed = 0;
+    optimizer.last_throughput = fixed_const(0.0001);
+    optimizer.last_attempts = htm_fastpath > 0 ? htm_fastpath - 1 : 1;
+    optimizer.best_ever_throughput = optimizer.last_throughput;
+    optimizer.best_ever_attempts = htm_fastpath;
+    optimizer.txns_while_stubborn = 1;
+    optimizer.cycles_while_stubborn = 100;
+    optimizer.txns_while_halven = 1;
+    optimizer.cycles_while_halven = 100;
+    optimizer.txns_while_giveup = 1;
+    optimizer.cycles_while_giveup = 100;
 #endif
+#endif
   }
   virtual void fini()
   {
Index: libitm/beginend.cc
===================================================================
--- libitm/beginend.cc (revision 219316)
+++ libitm/beginend.cc (working copy)
@@ -25,7 +25,6 @@
 #include "libitm_i.h"
 #include <pthread.h>

-
 using namespace GTM;

 #if !defined(HAVE_ARCH_GTM_THREAD) || !defined(HAVE_ARCH_GTM_THREAD_DISP)
@@ -39,6 +38,9 @@ unsigned GTM::gtm_thread::number_of_threads = 0;
 gtm_stmlock GTM::gtm_stmlock_array[LOCK_ARRAY_SIZE];
 atomic<gtm_version> GTM::gtm_clock;

+// Optimization of HTM executions.
+gtm_global_optimizer GTM::optimizer;
+
 /* ??? Move elsewhere when we figure out library initialization.  */
 uint64_t GTM::gtm_spin_count_var = 1000;

@@ -149,6 +151,225 @@ choose_code_path(uint32_t prop, abi_dispatch *disp
     return a_runInstrumentedCode;
 }

+static inline fixed_pt_t
+fixed_sqrt(fixed_pt_t n)
+{
+    int invert = 0;
+    int iter = FIXED_PT_FRAC_WIDTH;
+    int l, i;
+
+    if (n == 0 || n == FIXED_PT_ONE)
+      {
+        return n;
+      }
+    if (n < FIXED_PT_ONE && n > 6)
+      {
+        invert = 1;
+        n = fixed_div(FIXED_PT_ONE, n);
+      }
+    if (n > FIXED_PT_ONE)
+      {
+        int s = n;
+        iter = 0;
+        while (s > 0)
+          {
+            s >>= 2;
+            iter++;
+          }
+      }
+
+    l = (n >> 1) + 1;
+    for (i = 0; i < iter; i++)
+      {
+        l = (l + fixed_div(n, l)) >> 1;
+      }
+    if (invert)
+      {
+        return (fixed_div(FIXED_PT_ONE, l));
+      }
+    return l;
+}
+
+static inline fixed_pt_t
+fixed_ln(fixed_pt_t x)
+{
+  const fixed_pt_t LN2 = fixed_const(0.69314718055994530942);
+  const fixed_pt_t LG[7] = {
+    fixed_const(6.666666666666735130e-01),
+    fixed_const(3.999999999940941908e-01),
+    fixed_const(2.857142874366239149e-01),
+    fixed_const(2.222219843214978396e-01),
+    fixed_const(1.818357216161805012e-01),
+    fixed_const(1.531383769920937332e-01),
+    fixed_const(1.479819860511658591e-01)
+  };
+
+  if (x == 0)
+    {
+      return 0xffffffff;
+    }
+
+  fixed_pt_t log2 = 0;
+  fixed_pt_t xi = x;
+  while (xi > FIXED_PT_TWO)
+    {
+      xi >>= 1;
+      log2++;
+    }
+
+  fixed_pt_t f = xi - FIXED_PT_ONE;
+  fixed_pt_t s = fixed_div(f, FIXED_PT_TWO + f);
+  fixed_pt_t z = fixed_mul(s, s);
+  fixed_pt_t w = fixed_mul(z, z);
+  fixed_pt_t R =
+    fixed_mul(w, LG[1] + fixed_mul(w, LG[3] + fixed_mul(w, LG[5])))
+    + fixed_mul(z, LG[0] + fixed_mul(w, LG[2]
+        + fixed_mul(w, LG[4] + fixed_mul(w, LG[6]))));
+  return fixed_mul(LN2, (log2 << FIXED_PT_FRAC_WIDTH))
+    + f - fixed_mul(s, f - R);
+}
+
+// State not synchronized; assumes single thread usage at any time.
+// Invoked only by the thread reoptimizing, so assumption holds.
+int
+obtainRandomInt(int max)
+{
+  static uint32_t seed = 123456789;
+  seed = 1103515245u * seed + 12345u; // 32-bit LCG; wraps modulo 2^32
+  return seed % max;
+}
+
+// Called by the thread at the tail of the list of threads.
+void
+GTM::gtm_thread::reoptimize_htm_execution()
+{
+  gtm_thread *tx = gtm_thr();
+  uint64_t total_txs_executed = 0;
+
+  // Collect the statistics obtained so far.
+  serial_lock.read_lock(tx);
+  gtm_thread *ptr = list_of_threads;
+  for (; ptr; ptr = ptr->next_thread)
+    {
+      total_txs_executed += ptr->number_executed_txns;
+    }
+  serial_lock.read_unlock(tx);
+
+  // Obtain the delta performance with respect to the last period.
+  const uint64_t current_cycles = rdtsc();
+  const uint64_t cycles_used = current_cycles - optimizer.last_cycles;
+  const uint64_t new_txs_executed =
+    total_txs_executed - optimizer.last_total_txs_executed;
+  optimizer.last_cycles = current_cycles;
+  optimizer.last_total_txs_executed = total_txs_executed;
+  if (optimizer.optimized_mode == STUBBORN)
+    {
+      optimizer.txns_while_stubborn += new_txs_executed;
+      optimizer.cycles_while_stubborn += cycles_used;
+    }
+  else if (optimizer.optimized_mode == HALVEN)
+    {
+      optimizer.txns_while_halven += new_txs_executed;
+      optimizer.cycles_while_halven += cycles_used;
+    }
+  else
+    {
+      optimizer.txns_while_giveup += new_txs_executed;
+      optimizer.cycles_while_giveup += cycles_used;
+    }
+
+  // Compute Upper Confidence Bounds for the mode to choose next.
+  // Use fixed point arithmetic types to spare floating point usage.
+  const fixed_pt_t log_sum =
+    fixed_mul(FIXED_PT_TWO,
+      fixed_ln(fixed_const(optimizer.txns_while_stubborn)
+               + fixed_const(optimizer.txns_while_halven)
+               + fixed_const(optimizer.txns_while_giveup)));
+  const fixed_pt_t ucb_stubborn =
+    fixed_div(fixed_div(fixed_const(optimizer.txns_while_stubborn),
+                        fixed_const(optimizer.cycles_while_stubborn)),
+              optimizer.best_ever_throughput)
+    + fixed_sqrt(fixed_div(log_sum,
+                           fixed_const(optimizer.txns_while_stubborn)));
+  const fixed_pt_t ucb_halven =
+    fixed_div(fixed_div(fixed_const(optimizer.txns_while_halven),
+                        fixed_const(optimizer.cycles_while_halven)),
+              optimizer.best_ever_throughput)
+    + fixed_sqrt(fixed_div(log_sum, fixed_const(optimizer.txns_while_halven)));
+  const fixed_pt_t ucb_giveup =
+    fixed_div(fixed_div(fixed_const(optimizer.txns_while_giveup),
+                        fixed_const(optimizer.cycles_while_giveup)),
+              optimizer.best_ever_throughput)
+    + fixed_sqrt(fixed_div(log_sum, fixed_const(optimizer.txns_while_giveup)));
+
+  if (ucb_stubborn > ucb_halven && ucb_stubborn > ucb_giveup)
+    {
+      optimizer.optimized_mode = STUBBORN;
+    }
+  else if (ucb_halven > ucb_giveup)
+    {
+      optimizer.optimized_mode = HALVEN;
+    }
+  else
+    {
+      optimizer.optimized_mode = GIVEUP;
+    }
+
+  // Compute gradient descent for the number of retries.
+  const fixed_pt_t current_throughput =
+    fixed_div(fixed_const(new_txs_executed), fixed_const(cycles_used));
+  const fixed_pt_t change_for_better =
+    fixed_div(current_throughput, optimizer.last_throughput);
+  const fixed_pt_t change_for_worse =
+    fixed_div(optimizer.last_throughput, current_throughput);
+  int32_t last_attempts = optimizer.last_attempts;
+  int32_t current_attempts = optimizer.optimized_attempts;
+  int32_t new_attempts = current_attempts;
+  if (unlikely(change_for_worse > fixed_const(1.40)))
+    {
+      optimizer.optimized_attempts = optimizer.best_ever_attempts;
+      optimizer.last_throughput = current_throughput;
+      optimizer.last_attempts = current_attempts;
+      return;
+    }
+
+  if (unlikely(obtainRandomInt(100) == 0))
+    {
+      optimizer.last_attempts = optimizer.optimized_attempts;
+      optimizer.last_throughput = current_throughput;
+      optimizer.optimized_attempts = obtainRandomInt(18) + 2;
+      return;
+    }
+
+      if (change_for_better > fixed_const(1.05))
+        {
+          new_attempts += current_attempts - last_attempts;
+          if (current_attempts == last_attempts)
+            new_attempts += obtainRandomInt(2) == 0 ? -1 : 1;
+        }
+      else if (change_for_worse > fixed_const(1.05))
+        {
+          new_attempts -= current_attempts - last_attempts;
+          if (current_attempts == last_attempts)
+            new_attempts += obtainRandomInt(2) == 0 ? -1 : 1;
+        }
+      if (unlikely(new_attempts > 20))
+        new_attempts = 20;
+      else if (unlikely(new_attempts < 2))
+        new_attempts = 2;
+      if (current_attempts != new_attempts)
+        {
+          optimizer.last_attempts = current_attempts;
+          optimizer.last_throughput = current_throughput;
+        }
+      optimizer.optimized_attempts = new_attempts;
+      if (unlikely(current_throughput > optimizer.best_ever_throughput))
+        {
+          optimizer.best_ever_throughput = current_throughput;
+          optimizer.best_ever_attempts = current_attempts;
+        }
+}
+
 uint32_t
 GTM::gtm_thread::begin_transaction (uint32_t prop, const gtm_jmpbuf *jb)
 {
@@ -190,7 +411,7 @@ GTM::gtm_thread::begin_transaction (uint32_t prop,
 #ifndef HTM_CUSTOM_FASTPATH
   if (likely(htm_fastpath && (prop & pr_hasNoAbort)))
     {
-      for (uint32_t t = htm_fastpath; t; t--)
+      for (uint32_t t = optimizer.optimized_attempts; t; t--)
  {
   uint32_t ret = htm_begin();
   if (htm_begin_success(ret))
@@ -209,19 +430,36 @@ GTM::gtm_thread::begin_transaction (uint32_t prop,
     }
   // The transaction has aborted.  Don't retry if it's unlikely that
   // retrying the transaction will be successful.
-  if (!htm_abort_should_retry(ret))
+#ifdef HAVE_AS_RTM
+          if (htm_is_capacity_abort(ret))
+            {
+              gtm_capacity_abort_mode capacity_mode = optimizer.optimized_mode;
+              if (capacity_mode == HALVEN)
+                t = t / 2 + 1;
+              else if (capacity_mode == GIVEUP)
+                t = 1;
+            }
+          // Only one thread performs this to avoid the need for
+          // synchronization. We use the oldest thread that is active.
+          // We also reoptimize at most once per transaction.
+          if (unlikely(tx->next_thread == NULL &&
+              tx->number_executed_txns % 500 == 0 && t == 1))
+              reoptimize_htm_execution();
+#else
+          if (!htm_abort_should_retry(ret))
     break;
+#endif
   // Wait until any concurrent serial-mode transactions have finished.
   // This is an empty critical section, but won't be elided.
   if (serial_lock.is_write_locked())
     {
-      tx = gtm_thr();
-      if (unlikely(tx == NULL))
-        {
-          // See below.
-          tx = new gtm_thread();
-          set_gtm_thr(tx);
-        }
+              tx = gtm_thr();
+              if (unlikely(tx == NULL))
+                {
+                  // See below.
+                  tx = new gtm_thread();
+                  set_gtm_thr(tx);
+                }
       // Check whether there is an enclosing serial-mode transaction;
       // if so, we just continue as a nested transaction and don't
       // try to use the HTM fastpath.  This case can happen when an
@@ -262,12 +500,26 @@ GTM::gtm_thread::begin_transaction (uint32_t prop,
       // other fallback will use serial transactions, which don't use
       // restart_total but will reset it when committing.
       if (!(prop & pr_HTMRetriedAfterAbort))
- tx->restart_total = htm_fastpath;
+        tx->restart_total = optimizer.optimized_attempts;

       if (--tx->restart_total > 0)
  {
   // Wait until any concurrent serial-mode transactions have finished.
   // Essentially the same code as above.
+#ifdef HAVE_AS_RTM
+          if (prop & pr_HTMCapacityAbort)
+            {
+              gtm_capacity_abort_mode capacity_mode = optimizer.optimized_mode;
+              if (capacity_mode == HALVEN)
+                tx->restart_total = tx->restart_total / 2 + 1;
+              else if (capacity_mode == GIVEUP)
+                goto stop_custom_htm_fastpath;
+            }
+
+          if (unlikely(tx->next_thread == NULL &&
+              tx->number_executed_txns % 500 == 0 && tx->restart_total == 1))
+              reoptimize_htm_execution();
+#endif
   if (serial_lock.is_write_locked())
     {
       if (tx->nesting > 0)
@@ -665,12 +917,21 @@ _ITM_commitTransaction(void)
   if (likely(htm_fastpath && !gtm_thread::serial_lock.is_write_locked()))
     {
       htm_commit();
+      gtm_thread *tx = gtm_thr();
+      if (unlikely(tx == NULL))
+        {
+          tx = new gtm_thread();
+          set_gtm_thr(tx);
+        }
+      tx->number_executed_txns++;
       return;
     }
 #endif
   gtm_thread *tx = gtm_thr();
   if (!tx->trycommit ())
     tx->restart (RESTART_VALIDATE_COMMIT);
+
+  tx->number_executed_txns++;
 }

 void ITM_REGPARM
@@ -681,6 +942,13 @@ _ITM_commitTransactionEH(void *exc_ptr)
   if (likely(htm_fastpath && !gtm_thread::serial_lock.is_write_locked()))
     {
       htm_commit();
+      gtm_thread *tx = gtm_thr();
+      if (unlikely(tx == NULL))
+        {
+          tx = new gtm_thread();
+          set_gtm_thr(tx);
+        }
+      tx->number_executed_txns++;
       return;
     }
 #endif
@@ -690,4 +958,5 @@ _ITM_commitTransactionEH(void *exc_ptr)
       tx->eh_in_flight = exc_ptr;
       tx->restart (RESTART_VALIDATE_COMMIT);
     }
+  tx->number_executed_txns++;
 }
Index: libitm/config/x86/target.h
===================================================================
--- libitm/config/x86/target.h (revision 219316)
+++ libitm/config/x86/target.h (working copy)
@@ -126,12 +126,25 @@ htm_abort_should_retry (uint32_t begin_ret)
   return begin_ret & _XABORT_RETRY;
 }

+static inline bool
+htm_is_capacity_abort(uint32_t begin_ret)
+{
+  return begin_ret & _XABORT_CAPACITY;
+}
+
 /* Returns true iff a hardware transaction is currently being executed.  */
 static inline bool
 htm_transaction_active ()
 {
   return _xtest() != 0;
 }
+
+static inline uint64_t rdtsc()
+{
+    uint32_t hi, lo;
+    __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
+    return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 );
+}
 #endif


Index: libitm/config/x86/sjlj.S
===================================================================
--- libitm/config/x86/sjlj.S (revision 219316)
+++ libitm/config/x86/sjlj.S (working copy)
@@ -59,12 +59,14 @@
 #define pr_hasNoAbort 0x08
 #define pr_HTMRetryableAbort 0x800000
 #define pr_HTMRetriedAfterAbort 0x1000000
+#define pr_HTMCapacityAbort     0x2000000
 #define a_runInstrumentedCode 0x01
 #define a_runUninstrumentedCode 0x02
 #define a_tryHTMFastPath 0x20

 #define _XABORT_EXPLICIT (1 << 0)
 #define _XABORT_RETRY (1 << 1)
+#define _XABORT_CAPACITY (1 << 3)

  .text

@@ -108,9 +110,12 @@ SYM(_ITM_beginTransaction):
 .Ltxn_abort:
  /* If it might make sense to retry the HTM fast path, let the C++
    code decide.  */
- testl $(_XABORT_RETRY|_XABORT_EXPLICIT), %eax
+ testl $(_XABORT_RETRY|_XABORT_EXPLICIT|_XABORT_CAPACITY), %eax
  jz .Lno_htm
  orl $pr_HTMRetryableAbort, %edi
+ testl   $(_XABORT_CAPACITY), %eax
+ jz      .Lno_htm
+ orl     $pr_HTMCapacityAbort, %edi
  /* Let the C++ code handle the retry policy.  */
 .Lno_htm:
 #endif




On Fri, Apr 10, 2015 at 8:24 AM, Andi Kleen <andi@firstfloor.org> wrote:
> Nuno Diegues <nmld@ist.utl.pt> writes:
>>
>> One general question on how to proceed:
>> given that I make some further changes, should I post the whole patch again?
>
> Yes please resend the patch.
>
> -Andi

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] add self-tuning to x86 hardware fast path in libitm
  2015-04-30  3:52           ` Nuno Diegues
@ 2015-04-30  5:00             ` Andi Kleen
  2015-04-30 12:13               ` Nuno Diegues
  2015-05-14 22:21             ` [PING] " Andi Kleen
  2015-05-18 21:31             ` Torvald Riegel
  2 siblings, 1 reply; 20+ messages in thread
From: Andi Kleen @ 2015-04-30  5:00 UTC (permalink / raw)
  To: Nuno Diegues; +Cc: gcc-patches, Paolo Romano

Nuno Diegues <nmld@ist.utl.pt> writes:

> Hello,
>
> I have taken the chance to improve the patch by addressing the
> comments above in this thread.
> Namely:
>  - to use a simple random generator managed inside the library only
>  - removed floating point usage and replaced by fixed arithmetic
>  - added some comments where relevant

Thanks.

Patch looks good to me now. It would perhaps be nice to have an
environment variable to turn the adaptive algorithm off for tests,
but that's not critical.
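
Something checked once at initialization would do, for example (sketch;
the variable and flag names are made up, libitm does not define them today):

#include <stdlib.h>

static bool adaptive_retries_enabled = true;

static void
check_adaptivity_env (void)
{
  /* Hypothetical switch to fall back to the fixed htm_fastpath behavior.  */
  if (getenv ("ITM_DISABLE_ADAPTIVITY") != NULL)
    adaptive_retries_enabled = false;
}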

It would also be nice to test it on something else, but I understand
it's difficult to find other software using the STM syntax.

I can't approve the patch, however. I believe it's big enough that you
may need a copyright assignment.

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] add self-tuning to x86 hardware fast path in libitm
  2015-04-30  5:00             ` Andi Kleen
@ 2015-04-30 12:13               ` Nuno Diegues
  2015-05-05 12:49                 ` Nuno Diegues
  0 siblings, 1 reply; 20+ messages in thread
From: Nuno Diegues @ 2015-04-30 12:13 UTC (permalink / raw)
  To: gcc-patches; +Cc: Andi Kleen, Paolo Romano

> Patch looks good to me now. It would be perhaps nice to have an
> environment variable to turn the adaptive algorithm off for tests,
> but that's not critical.

Yes, that makes perfect sense.


> It would be also nice to test it on something else, but I understand
> it's difficult to find other software using the STM syntax.

Indeed. I'll try to find some time to work on that, but it may take a while.


> I can't approve the patch however. I believe it's big enough that you
> may need a copy right assignment.

I have signed an assignment form from the Free Software Foundation to
deal exactly with those matters for this patch to libitm. Torvald
Riegel had advised me to do so.

I have not, however, received any further information, so I'm left
wondering whether it went through or is still pending. Perhaps I should
ping the FSF to check on that?


Best regards,
-- Nuno Diegues



>
> -Andi
>
> --
> ak@linux.intel.com -- Speaking for myself only

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] add self-tuning to x86 hardware fast path in libitm
  2015-04-30 12:13               ` Nuno Diegues
@ 2015-05-05 12:49                 ` Nuno Diegues
  0 siblings, 0 replies; 20+ messages in thread
From: Nuno Diegues @ 2015-05-05 12:49 UTC (permalink / raw)
  To: gcc-patches; +Cc: Andi Kleen, Paolo Romano

Today I received the news that the copyright assignment with the FSF
has been completed.


On Thu, Apr 30, 2015 at 8:10 AM, Nuno Diegues <nmld@ist.utl.pt> wrote:
>
> > Patch looks good to me now. It would be perhaps nice to have an
> > environment variable to turn the adaptive algorithm off for tests,
> > but that's not critical.
>
> Yes, that makes perfect sense.
>
>
> > It would be also nice to test it on something else, but I understand
> > it's difficult to find other software using the STM syntax.
>
> Indeed. I'll try to find some time to work on that, but it may take a while.
>
>
> > I can't approve the patch however. I believe it's big enough that you
> > may need a copyright assignment.
>
> I have signed a copyright assignment form from the Free Software Foundation
> to deal exactly with those matters for this patch to libitm. Torvald
> Riegel had advised me to do so.
>
> I have not, however, received any further information, so I'm left
> wondering whether it went through or is still pending. Perhaps I should
> ping the FSF to check on its status?
>
>
> Best regards,
> -- Nuno Diegues
>
>
>
> >
> > -Andi
> >
> > --
> > ak@linux.intel.com -- Speaking for myself only

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PING] Re: [PATCH] add self-tuning to x86 hardware fast path in libitm
  2015-04-30  3:52           ` Nuno Diegues
  2015-04-30  5:00             ` Andi Kleen
@ 2015-05-14 22:21             ` Andi Kleen
  2015-05-18 21:31             ` Torvald Riegel
  2 siblings, 0 replies; 20+ messages in thread
From: Andi Kleen @ 2015-05-14 22:21 UTC (permalink / raw)
  To: Nuno Diegues; +Cc: gcc-patches, Paolo Romano


Ping!

Could someone who can approve please review the patch?

Thanks,
-Andi

Nuno Diegues <nmld@ist.utl.pt> writes:

> Hello,
>
> I have taken the chance to improve the patch by addressing the
> comments above in this thread.
> Namely:
>  - to use a simple random generator managed inside the library only
>  - removed floating point usage and replaced by fixed arithmetic
>  - added some comments where relevant
>
> Re-running the STAMP benchmarks shows similar gains (to those shown
> above) with respect to an unmodified GCC 5.0.0:
>
> benchmark: speedup
> genome: 1.58
> intruder: 1.78
> labyrinth: 1.0
> ssca2: 1.01
> yada: 1.0
> kmeans-high: 1.16
> kmeans-low: 1.16
> vacation-high: 2.05
> vacation-low: 2.81
>
> I appreciate any feedback and comments!
>
> Index: libitm/libitm_i.h
> ===================================================================
> --- libitm/libitm_i.h (revision 219316)
> +++ libitm/libitm_i.h (working copy)
> @@ -242,6 +242,9 @@ struct gtm_thread
>    uint32_t restart_reason[NUM_RESTARTS];
>    uint32_t restart_total;
>
> +  // Keeps track of how many transactions were successfully executed.
> +  uint64_t number_executed_txns;
> +
>    // *** The shared part of gtm_thread starts here. ***
>    // Shared state is on separate cachelines to avoid false sharing with
>    // thread-local parts of gtm_thread.
> @@ -286,6 +289,8 @@ struct gtm_thread
>    static void *operator new(size_t);
>    static void operator delete(void *);
>
> +  static void reoptimize_htm_execution();
> +
>    // Invoked from assembly language, thus the "asm" specifier on
>    // the name, avoiding complex name mangling.
>    static uint32_t begin_transaction(uint32_t, const gtm_jmpbuf *)
> @@ -309,6 +314,59 @@ struct gtm_thread
>    void commit_user_actions ();
>  };
>
> +// Different ways to deal with capacity aborts in HTM execution.
> +enum gtm_capacity_abort_mode
> +{
> +  STUBBORN,
> +  HALVEN,
> +  GIVEUP
> +};
> +
> +// Definition of fixed point arithmetic types.
> +// Half the bits are dedicated to the fractional type, and the rest to the
> +// "whole" part.
> +#define FIXED_PT_WIDTH  64
> +#define FIXED_PT_INTEGER_WIDTH  32
> +typedef uint64_t fixed_pt_t;
> +typedef __uint128_t fixed_pt_td;
> +
> +#define FIXED_PT_FRAC_WIDTH     (FIXED_PT_WIDTH - FIXED_PT_INTEGER_WIDTH)
> +#define FIXED_PT_ONE    ((fixed_pt_t)((fixed_pt_t)1 << FIXED_PT_FRAC_WIDTH))
> +#define FIXED_PT_TWO    (FIXED_PT_ONE + FIXED_PT_ONE)
> +
> +#define fixed_mul(A,B) \
> +  ((fixed_pt_t)(((fixed_pt_td)(A) * (fixed_pt_td)(B)) >> FIXED_PT_FRAC_WIDTH))
> +#define fixed_div(A,B) \
> +  ((fixed_pt_t)(((fixed_pt_td)(A) << FIXED_PT_FRAC_WIDTH) / (fixed_pt_td)(B)))
> +#define fixed_const(R) \
> +  ((fixed_pt_t)((R) * FIXED_PT_ONE + ((R) >= 0 ? 0.5 : -0.5)))
> +
> +// Maintains the current values optimized for HTM execution and the
> +// corresponding statistics gathered for the decision-making.
> +struct gtm_global_optimizer
> +{
> +  // Mode chosen to currently deal with capacity aborts.
> +  gtm_capacity_abort_mode optimized_mode;
> +  // Number of attempts chosen to currently insist on HTM execution.
> +  uint32_t optimized_attempts;
> +
> +  uint64_t last_cycles;
> +  uint64_t last_total_txs_executed;
> +
> +  fixed_pt_t last_throughput;
> +  uint32_t last_attempts;
> +
> +  fixed_pt_t best_ever_throughput;
> +  uint32_t best_ever_attempts;
> +
> +  uint64_t txns_while_stubborn;
> +  uint64_t cycles_while_stubborn;
> +  uint64_t txns_while_halven;
> +  uint64_t cycles_while_halven;
> +  uint64_t txns_while_giveup;
> +  uint64_t cycles_while_giveup;
> +};
> +
>  } // namespace GTM
>
>  #include "tls.h"
> @@ -346,6 +404,9 @@ extern gtm_cacheline_mask gtm_mask_stack(gtm_cache
>  // the name, avoiding complex name mangling.
>  extern uint32_t htm_fastpath __asm__(UPFX "gtm_htm_fastpath");
>
> +// Maintains the optimization for HTM execution.
> +extern gtm_global_optimizer optimizer;
> +
>  } // namespace GTM
>
>  #endif // LIBITM_I_H
> Index: libitm/libitm.h
> ===================================================================
> --- libitm/libitm.h (revision 219316)
> +++ libitm/libitm.h (working copy)
> @@ -101,7 +101,8 @@ typedef enum
>     /* These are not part of the ABI but used for custom HTM fast paths.  See
>        ITM_beginTransaction and gtm_thread::begin_transaction.  */
>     pr_HTMRetryableAbort = 0x800000,
> -   pr_HTMRetriedAfterAbort = 0x1000000
> +   pr_HTMRetriedAfterAbort = 0x1000000,
> +   pr_HTMCapacityAbort          = 0x2000000
>  } _ITM_codeProperties;
>
>  /* Result from startTransaction that describes what actions to take.
> Index: libitm/method-serial.cc
> ===================================================================
> --- libitm/method-serial.cc (revision 219316)
> +++ libitm/method-serial.cc (working copy)
> @@ -223,7 +223,23 @@ struct htm_mg : public method_group
>      // initially disabled.
>  #ifdef USE_HTM_FASTPATH
>      htm_fastpath = htm_init();
> +#ifdef HAVE_AS_RTM
> +    optimizer.optimized_mode = STUBBORN;
> +    optimizer.optimized_attempts = htm_fastpath;
> +    optimizer.last_cycles = rdtsc();
> +    optimizer.last_total_txs_executed = 0;
> +    optimizer.last_throughput = fixed_const(0.0001);
> +    optimizer.last_attempts = htm_fastpath > 0 ? htm_fastpath - 1 : 1;
> +    optimizer.best_ever_throughput = optimizer.last_throughput;
> +    optimizer.best_ever_attempts = htm_fastpath;
> +    optimizer.txns_while_stubborn = 1;
> +    optimizer.cycles_while_stubborn = 100;
> +    optimizer.txns_while_halven = 1;
> +    optimizer.cycles_while_halven = 100;
> +    optimizer.txns_while_giveup = 1;
> +    optimizer.cycles_while_giveup = 100;
>  #endif
> +#endif
>    }
>    virtual void fini()
>    {
> Index: libitm/beginend.cc
> ===================================================================
> --- libitm/beginend.cc (revision 219316)
> +++ libitm/beginend.cc (working copy)
> @@ -25,7 +25,6 @@
>  #include "libitm_i.h"
>  #include <pthread.h>
>
> -
>  using namespace GTM;
>
>  #if !defined(HAVE_ARCH_GTM_THREAD) || !defined(HAVE_ARCH_GTM_THREAD_DISP)
> @@ -39,6 +38,9 @@ unsigned GTM::gtm_thread::number_of_threads = 0;
>  gtm_stmlock GTM::gtm_stmlock_array[LOCK_ARRAY_SIZE];
>  atomic<gtm_version> GTM::gtm_clock;
>
> +// Optimization of HTM executions.
> +gtm_global_optimizer GTM::optimizer;
> +
>  /* ??? Move elsewhere when we figure out library initialization.  */
>  uint64_t GTM::gtm_spin_count_var = 1000;
>
> @@ -149,6 +151,225 @@ choose_code_path(uint32_t prop, abi_dispatch *disp
>      return a_runInstrumentedCode;
>  }
>
> +static inline fixed_pt_t
> +fixed_sqrt(fixed_pt_t n)
> +{
> +    int invert = 0;
> +    int iter = FIXED_PT_FRAC_WIDTH;
> +    int l, i;
> +
> +    if (n == 0 || n == FIXED_PT_ONE)
> +      {
> +        return n;
> +      }
> +    if (n < FIXED_PT_ONE && n > 6)
> +      {
> +        invert = 1;
> +        n = fixed_div(FIXED_PT_ONE, n);
> +      }
> +    if (n > FIXED_PT_ONE)
> +      {
> +        int s = n;
> +        iter = 0;
> +        while (s > 0)
> +          {
> +            s >>= 2;
> +            iter++;
> +          }
> +      }
> +
> +    l = (n >> 1) + 1;
> +    for (i = 0; i < iter; i++)
> +      {
> +        l = (l + fixed_div(n, l)) >> 1;
> +      }
> +    if (invert)
> +      {
> +        return (fixed_div(FIXED_PT_ONE, l));
> +      }
> +    return l;
> +}
> +
> +static inline fixed_pt_t
> +fixed_ln(fixed_pt_t x)
> +{
> +  const fixed_pt_t LN2 = fixed_const(0.69314718055994530942);
> +  const fixed_pt_t LG[7] = {
> +    fixed_const(6.666666666666735130e-01),
> +    fixed_const(3.999999999940941908e-01),
> +    fixed_const(2.857142874366239149e-01),
> +    fixed_const(2.222219843214978396e-01),
> +    fixed_const(1.818357216161805012e-01),
> +    fixed_const(1.531383769920937332e-01),
> +    fixed_const(1.479819860511658591e-01)
> +  };
> +
> +  if (x == 0)
> +    {
> +      return 0xffffffff;
> +    }
> +
> +  fixed_pt_t log2 = 0;
> +  fixed_pt_t xi = x;
> +  while (xi > FIXED_PT_TWO)
> +    {
> +      xi >>= 1;
> +      log2++;
> +    }
> +
> +  fixed_pt_t f = xi - FIXED_PT_ONE;
> +  fixed_pt_t s = fixed_div(f, FIXED_PT_TWO + f);
> +  fixed_pt_t z = fixed_mul(s, s);
> +  fixed_pt_t w = fixed_mul(z, z);
> +  fixed_pt_t R =
> +    fixed_mul(w, LG[1] + fixed_mul(w, LG[3] + fixed_mul(w, LG[5])))
> +    + fixed_mul(z, LG[0] + fixed_mul(w, LG[2]
> +        + fixed_mul(w, LG[4] + fixed_mul(w, LG[6]))));
> +  return fixed_mul(LN2, (log2 << FIXED_PT_FRAC_WIDTH))
> +    + f - fixed_mul(s, f - R);
> +}
> +
> +// State not synchronized; assumes single thread usage at any time.
> +// Invoked only by the thread reoptimizing, so assumption holds.
> +int
> +obtainRandomInt(int max)
> +{
> +  static int seed = 123456789;
> +  seed = (1103515245 * seed + 12345) % 4294967296;
> +  return seed;
> +}
> +
> +// Called by the thread at the tail of the list of threads.
> +void
> +GTM::gtm_thread::reoptimize_htm_execution()
> +{
> +  gtm_thread *tx = gtm_thr();
> +  uint64_t total_txs_executed = 0;
> +
> +  // Collect the statistics obtained so far.
> +  serial_lock.read_lock(tx);
> +  gtm_thread *ptr = list_of_threads;
> +  for (; ptr; ptr = ptr->next_thread)
> +    {
> +      total_txs_executed += ptr->number_executed_txns;
> +    }
> +  serial_lock.read_unlock(tx);
> +
> +  // Obtain the delta performance with respect to the last period.
> +  const uint64_t current_cycles = rdtsc();
> +  const uint64_t cycles_used = current_cycles - optimizer.last_cycles;
> +  const uint64_t new_txs_executed =
> +    total_txs_executed - optimizer.last_total_txs_executed;
> +  optimizer.last_cycles = current_cycles;
> +  optimizer.last_total_txs_executed = total_txs_executed;
> +  if (optimizer.optimized_mode == STUBBORN)
> +    {
> +      optimizer.txns_while_stubborn += new_txs_executed;
> +      optimizer.cycles_while_stubborn += cycles_used;
> +    }
> +  else if (optimizer.optimized_mode == HALVEN)
> +    {
> +      optimizer.txns_while_halven += new_txs_executed;
> +      optimizer.cycles_while_halven += cycles_used;
> +    }
> +  else
> +    {
> +      optimizer.txns_while_giveup += new_txs_executed;
> +      optimizer.cycles_while_giveup += cycles_used;
> +    }
> +
> +  // Compute Upper Confidence Bounds for the mode to choose next.
> +  // Use fixed point arithmetic types to spare floating point usage.
> +  const fixed_pt_t log_sum =
> +    fixed_mul(FIXED_PT_TWO,
> +      fixed_ln(fixed_const(optimizer.txns_while_stubborn)
> +               + fixed_const(optimizer.txns_while_halven)
> +               + fixed_const(optimizer.txns_while_giveup)));
> +  const fixed_pt_t ucb_stubborn =
> +    fixed_div(fixed_div(fixed_const(optimizer.txns_while_stubborn),
> +                        fixed_const(optimizer.cycles_while_stubborn)),
> +              optimizer.best_ever_throughput)
> +    + fixed_sqrt(fixed_div(log_sum,
> +                           fixed_const(optimizer.txns_while_stubborn)));
> +  const fixed_pt_t ucb_halven =
> +    fixed_div(fixed_div(fixed_const(optimizer.txns_while_halven),
> +                        fixed_const(optimizer.cycles_while_halven)),
> +              optimizer.best_ever_throughput)
> +    + fixed_sqrt(fixed_div(log_sum, fixed_const(optimizer.txns_while_halven)));
> +  const fixed_pt_t ucb_giveup =
> +    fixed_div(fixed_div(fixed_const(optimizer.txns_while_giveup),
> +                        fixed_const(optimizer.cycles_while_giveup)),
> +              optimizer.best_ever_throughput)
> +    + fixed_sqrt(fixed_div(log_sum, fixed_const(optimizer.txns_while_giveup)));
> +
> +  if (ucb_stubborn > ucb_halven && ucb_stubborn > ucb_giveup)
> +    {
> +      optimizer.optimized_mode = STUBBORN;
> +    }
> +  else if (ucb_halven > ucb_giveup)
> +    {
> +      optimizer.optimized_mode = HALVEN;
> +    }
> +  else
> +    {
> +      optimizer.optimized_mode = GIVEUP;
> +    }
> +
> +  // Compute gradient descent for the number of retries.
> +  const fixed_pt_t current_throughput =
> +    fixed_div(fixed_const(new_txs_executed), fixed_const(cycles_used));
> +  const fixed_pt_t change_for_better =
> +    fixed_div(current_throughput, optimizer.last_throughput);
> +  const fixed_pt_t change_for_worse =
> +    fixed_div(optimizer.last_throughput, current_throughput);
> +  int32_t last_attempts = optimizer.last_attempts;
> +  int32_t current_attempts = optimizer.optimized_attempts;
> +  int32_t new_attempts = current_attempts;
> +  if (unlikely(change_for_worse > fixed_const(1.40)))
> +    {
> +      optimizer.optimized_attempts = optimizer.best_ever_attempts;
> +      optimizer.last_throughput = current_throughput;
> +      optimizer.last_attempts = current_attempts;
> +      return;
> +    }
> +
> +  if (unlikely(obtainRandomInt(100) == 0))
> +    {
> +      optimizer.last_attempts = optimizer.optimized_attempts;
> +      optimizer.last_throughput = current_throughput;
> +      optimizer.optimized_attempts = obtainRandomInt(18) + 2;
> +      return;
> +    }
> +
> +      if (change_for_better > fixed_const(1.05))
> +        {
> +          new_attempts += current_attempts - last_attempts;
> +          if (current_attempts == last_attempts)
> +            new_attempts += obtainRandomInt(2) == 0 ? -1 : 1;
> +        }
> +      else if (change_for_worse > fixed_const(1.05))
> +        {
> +          new_attempts -= current_attempts - last_attempts;
> +          if (current_attempts == last_attempts)
> +            new_attempts += obtainRandomInt(2) == 0 ? -1 : 1;
> +        }
> +      if (unlikely(new_attempts > 20))
> +        new_attempts = 20;
> +      else if (unlikely(new_attempts < 2))
> +        new_attempts = 2;
> +      if (current_attempts != new_attempts)
> +        {
> +          optimizer.last_attempts = current_attempts;
> +          optimizer.last_throughput = current_throughput;
> +        }
> +      optimizer.optimized_attempts = new_attempts;
> +      if (unlikely(current_throughput > optimizer.best_ever_throughput))
> +        {
> +          optimizer.best_ever_throughput = current_throughput;
> +          optimizer.best_ever_attempts = current_attempts;
> +        }
> +}
> +
>  uint32_t
>  GTM::gtm_thread::begin_transaction (uint32_t prop, const gtm_jmpbuf *jb)
>  {
> @@ -190,7 +411,7 @@ GTM::gtm_thread::begin_transaction (uint32_t prop,
>  #ifndef HTM_CUSTOM_FASTPATH
>    if (likely(htm_fastpath && (prop & pr_hasNoAbort)))
>      {
> -      for (uint32_t t = htm_fastpath; t; t--)
> +      for (uint32_t t = optimizer.optimized_attempts; t; t--)
>   {
>    uint32_t ret = htm_begin();
>    if (htm_begin_success(ret))
> @@ -209,19 +430,36 @@ GTM::gtm_thread::begin_transaction (uint32_t prop,
>      }
>    // The transaction has aborted.  Don't retry if it's unlikely that
>    // retrying the transaction will be successful.
> -  if (!htm_abort_should_retry(ret))
> +#ifdef HAVE_AS_RTM
> +          if (htm_is_capacity_abort(ret))
> +            {
> +              gtm_capacity_abort_mode capacity_mode = optimizer.optimized_mode;
> +              if (capacity_mode == HALVEN)
> +                t = t / 2 + 1;
> +              else if (capacity_mode == GIVEUP)
> +                t = 1;
> +            }
> +          // Only one thread performs this to avoid the need for
> +          // synchronization. We use the oldest thread that is active.
> +          // We also reoptimize at most once per transaction.
> +          if (unlikely(tx->next_thread == NULL &&
> +              tx->number_executed_txns % 500 == 0 && t == 1))
> +              reoptimize_htm_execution();
> +#else
> +          if (!htm_abort_should_retry(ret))
>      break;
> +#endif
>    // Wait until any concurrent serial-mode transactions have finished.
>    // This is an empty critical section, but won't be elided.
>    if (serial_lock.is_write_locked())
>      {
> -      tx = gtm_thr();
> -      if (unlikely(tx == NULL))
> -        {
> -          // See below.
> -          tx = new gtm_thread();
> -          set_gtm_thr(tx);
> -        }
> +              tx = gtm_thr();
> +              if (unlikely(tx == NULL))
> +                {
> +                  // See below.
> +                  tx = new gtm_thread();
> +                  set_gtm_thr(tx);
> +                }
>        // Check whether there is an enclosing serial-mode transaction;
>        // if so, we just continue as a nested transaction and don't
>        // try to use the HTM fastpath.  This case can happen when an
> @@ -262,12 +500,26 @@ GTM::gtm_thread::begin_transaction (uint32_t prop,
>        // other fallback will use serial transactions, which don't use
>        // restart_total but will reset it when committing.
>        if (!(prop & pr_HTMRetriedAfterAbort))
> - tx->restart_total = htm_fastpath;
> +        tx->restart_total = optimizer.optimized_attempts;
>
>        if (--tx->restart_total > 0)
>   {
>    // Wait until any concurrent serial-mode transactions have finished.
>    // Essentially the same code as above.
> +#ifdef HAVE_AS_RTM
> +          if (prop & pr_HTMCapacityAbort)
> +            {
> +              gtm_capacity_abort_mode capacity_mode = optimizer.optimized_mode;
> +              if (capacity_mode == HALVEN)
> +                tx->restart_total = tx->restart_total;
> +              else if (capacity_mode == GIVEUP)
> +                goto stop_custom_htm_fastpath;
> +            }
> +
> +          if (unlikely(tx->next_thread == NULL &&
> +              tx->number_executed_txns % 500 == 0 && tx->restart_total == 1))
> +              reoptimize_htm_execution();
> +#endif
>    if (serial_lock.is_write_locked())
>      {
>        if (tx->nesting > 0)
> @@ -665,12 +917,21 @@ _ITM_commitTransaction(void)
>    if (likely(htm_fastpath && !gtm_thread::serial_lock.is_write_locked()))
>      {
>        htm_commit();
> +      gtm_thread *tx = gtm_thr();
> +      if (unlikely(tx == NULL))
> +        {
> +          tx = new gtm_thread();
> +          set_gtm_thr(tx);
> +        }
> +      tx->number_executed_txns++;
>        return;
>      }
>  #endif
>    gtm_thread *tx = gtm_thr();
>    if (!tx->trycommit ())
>      tx->restart (RESTART_VALIDATE_COMMIT);
> +
> +  tx->number_executed_txns++;
>  }
>
>  void ITM_REGPARM
> @@ -681,6 +942,13 @@ _ITM_commitTransactionEH(void *exc_ptr)
>    if (likely(htm_fastpath && !gtm_thread::serial_lock.is_write_locked()))
>      {
>        htm_commit();
> +      gtm_thread *tx = gtm_thr();
> +      if (unlikely(tx == NULL))
> +        {
> +          tx = new gtm_thread();
> +          set_gtm_thr(tx);
> +        }
> +      tx->number_executed_txns++;
>        return;
>      }
>  #endif
> @@ -690,4 +958,5 @@ _ITM_commitTransactionEH(void *exc_ptr)
>        tx->eh_in_flight = exc_ptr;
>        tx->restart (RESTART_VALIDATE_COMMIT);
>      }
> +  tx->number_executed_txns++;
>  }
> Index: libitm/config/x86/target.h
> ===================================================================
> --- libitm/config/x86/target.h (revision 219316)
> +++ libitm/config/x86/target.h (working copy)
> @@ -126,12 +126,25 @@ htm_abort_should_retry (uint32_t begin_ret)
>    return begin_ret & _XABORT_RETRY;
>  }
>
> +static inline bool
> +htm_is_capacity_abort(uint32_t begin_ret)
> +{
> +  return begin_ret & _XABORT_CAPACITY;
> +}
> +
>  /* Returns true iff a hardware transaction is currently being executed.  */
>  static inline bool
>  htm_transaction_active ()
>  {
>    return _xtest() != 0;
>  }
> +
> +static inline uint64_t rdtsc()
> +{
> +    uint32_t hi, lo;
> +    __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
> +    return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 );
> +}
>  #endif
>
>
> Index: libitm/config/x86/sjlj.S
> ===================================================================
> --- libitm/config/x86/sjlj.S (revision 219316)
> +++ libitm/config/x86/sjlj.S (working copy)
> @@ -59,12 +59,14 @@
>  #define pr_hasNoAbort 0x08
>  #define pr_HTMRetryableAbort 0x800000
>  #define pr_HTMRetriedAfterAbort 0x1000000
> +#define pr_HTMCapacityAbort     0x2000000
>  #define a_runInstrumentedCode 0x01
>  #define a_runUninstrumentedCode 0x02
>  #define a_tryHTMFastPath 0x20
>
>  #define _XABORT_EXPLICIT (1 << 0)
>  #define _XABORT_RETRY (1 << 1)
> +#define _XABORT_CAPACITY (1 << 3)
>
>   .text
>
> @@ -108,9 +110,12 @@ SYM(_ITM_beginTransaction):
>  .Ltxn_abort:
>   /* If it might make sense to retry the HTM fast path, let the C++
>     code decide.  */
> - testl $(_XABORT_RETRY|_XABORT_EXPLICIT), %eax
> + testl $(_XABORT_RETRY|_XABORT_EXPLICIT|_XABORT_CAPACITY), %eax
>   jz .Lno_htm
>   orl $pr_HTMRetryableAbort, %edi
> + testl   $(_XABORT_CAPACITY), %eax
> + jz      .Lno_htm
> + orl     $pr_HTMCapacityAbort, %edi
>   /* Let the C++ code handle the retry policy.  */
>  .Lno_htm:
>  #endif
>
>
>
>
> On Fri, Apr 10, 2015 at 8:24 AM, Andi Kleen <andi@firstfloor.org> wrote:
>> Nuno Diegues <nmld@ist.utl.pt> writes:
>>>
>>> One general question on how to proceed:
>>> given that I make some further changes, should I post the whole patch again?
>>
>> Yes please resend the patch.
>>
>> -Andi
>

-- 
ak@linux.intel.com -- Speaking for myself only

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] add self-tuning to x86 hardware fast path in libitm
  2015-04-30  3:52           ` Nuno Diegues
  2015-04-30  5:00             ` Andi Kleen
  2015-05-14 22:21             ` [PING] " Andi Kleen
@ 2015-05-18 21:31             ` Torvald Riegel
  2015-05-18 21:48               ` Andi Kleen
  2015-05-19  4:54               ` Nuno Diegues
  2 siblings, 2 replies; 20+ messages in thread
From: Torvald Riegel @ 2015-05-18 21:31 UTC (permalink / raw)
  To: Nuno Diegues; +Cc: Andi Kleen, gcc-patches, Paolo Romano

On Wed, 2015-04-29 at 23:23 -0400, Nuno Diegues wrote: 
> Hello,
> 
> I have taken the chance to improve the patch by addressing the
> comments above in this thread.
> Namely:
>  - to use a simple random generator managed inside the library only
>  - removed floating point usage and replaced by fixed arithmetic
>  - added some comments where relevant

First of all, sorry for taking so long to review this.  Thank you for
the contribution.

There are several high-level changes that I'd like to see (or be
convinced of that those aren't necessary).  I'll discuss these first.
I'll have more detailed comments inlined in the patch.


My major concern is about rdtsc being used.  The relation to frequency
adaption that Andi mentioned is one issue, but the bigger issue to me is
that the runtime of transactions might vary a lot, and that the relation
of txnal vs. nontxnal code executed in a period might vary a lot too.

Are there better options for the utility function, or can we tune it to
be less affected by varying txn length and likelihood of txnal vs.
nontxnal code?  What are the things we want to avoid with the tuning?  I
can think of:
* Not needing to wait for serial txns, or being aborted by a serial txn.
* Not retrying using HTM too much so that the retry time is larger than
the scalability we actually gain by having other txns commit
concurrently.

Anything else?  Did you consider other utility functions during your
research?  Also, note that the mitigation strategy for rdtsc
short-comings that you mention in the paper is not applicable in
general, specifically not in libitm.


Another issue is that we need to implement the tuning in a portable way.
You currently make it depend on whether the assembler knows about RTM,
whereas the rest of the code makes this more portable and defers to
arch-specific files.  I'd prefer if we could use the tuning on other
archs too.  But for that, we need to cleanly separate generic from
arch-specific parts.  That affects magic numbers as well as things like
rdtsc().


Generally, the patch needs to include more comments.  Most importantly,
we need to document why we implemented things the way we did, or do
tuning the way we do.  In cases where neither the intent nor the code is
trivial to see, explain the intent or reference the place where
you document the big picture.
A good rule of thumb is adding comments whenever it's not simple to see
why we use one of several possible implementation alternatives.

The magic numbers being used are a good example.  Make them constants
and don't just put them directly in the code, and then add documentation
why you chose this number and why it's okay to make that choice.  If it
isn't necessarily okay (eg, other archs, future systems), but you don't
have a better solution right now, add something like "??? Not ideal
because of XYZ.".  If there's a source for some of the magic numbers,
cite it (e.g., [28] in your paper might be one for the tuning
thresholds, I guess).
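
Just to illustrate what I mean (the names here are only examples; the
values are the ones already in the patch):

  // How often (in committed txns) we try to reoptimize.
  // ??? Document where this value comes from (measurements, [28], ...).
  static const uint32_t reopt_period_txns = 500;
  // Throughput drop beyond which we revert to the best-ever configuration.
  static const fixed_pt_t throughput_drop_threshold = fixed_const (1.40);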


Reoptimizing only in a specific, fixed thread is insufficient in the
general case.  There may be a thread that only runs an initial txn and
then never another one; if this happens to be the last thread
registered, you'll never get any tuning.  If we want the tuning to be
effective, one of the threads *actively* running txns needs to tune
eventually, always.

Depending on that, you'll probably have to change the sync between the
tuning thread and others.  Serial mode may be a simple way to do that.
It may be possible to only check for tuning being necessary when in
serial mode.  It could be possible that we end up trying HTM too often
yet never go to serial mode; however, that seems unlikely to me (but I
haven't thought thoroughly about it).

Also, please remember that only data-race-free code is strictly correct
C++ code (the only exception being the validated-loads special case in
the STM implementations, for which C++ doesn't have support yet (but for
which proposals exist)).  Thus, use atomic operations accordingly.  That
might create another issue though in that you can't assume 64b atomic
ops to be supported on all archs.  Maybe using serial mode for tuning
doesn't trigger that issue though.
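
For the per-thread counter that the tuning thread reads, a rough sketch of
what I mean (in libitm this would use the same atomic<> that is already
used for gtm_clock; whether relaxed accesses are enough depends on how the
tuning ends up being synchronized):

  #include <atomic>
  #include <cstdint>

  // The counter is written only by its owning thread and read by the
  // tuning thread, so relaxed atomic accesses make this data-race-free
  // without adding ordering overhead.
  // ??? 64b atomics are not available on all archs; tuning from serial
  // mode might avoid needing them.
  struct txn_counter
  {
    std::atomic<uint64_t> number_executed_txns { 0 };

    void increment ()
    {
      // Single writer, so a relaxed load/store pair is sufficient.
      number_executed_txns.store (
        number_executed_txns.load (std::memory_order_relaxed) + 1,
        std::memory_order_relaxed);
    }

    uint64_t read () const
    { return number_executed_txns.load (std::memory_order_relaxed); }
  };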


I'm wondering about whether it really makes sense to treat XABORT like
conflicts and other abort reasons, instead of like capacity aborts.
Perhaps we need to differentiate between the explicit abort codes glibc
uses, so that we can distinguish between cases where an abort is
supposed to signal incompatibility with txnal execution and cases where
it's just used for lock elision (see sysdeps/unix/sysv/linux/x86/hle.h
in current glibc):
#define _ABORT_LOCK_BUSY        0xff
#define _ABORT_LOCK_IS_LOCKED   0xfe
#define _ABORT_NESTED_TRYLOCK   0xfd
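
To make that concrete, a rough sketch of how the C++ retry code could tell
these apart (the helper name is made up, not part of the patch):

  // The RTM status word carries the code passed to xabort in bits 31:24
  // whenever _XABORT_EXPLICIT is set.
  static inline bool
  htm_abort_is_lock_elision (uint32_t begin_ret)
  {
    if (!(begin_ret & _XABORT_EXPLICIT))
      return false;
    uint32_t code = (begin_ret >> 24) & 0xff;
    // Abort codes used by glibc's lock elision (see hle.h above).
    return code == 0xff || code == 0xfe || code == 0xfd;
  }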


Andi said that it would be nice to have an env var turning tuning
on/off.  I agree in principle, yet would prefer to have the tuning be
the default.  What about adding an "htm_notune" option in
parse_default_method?  It would be even nicer to have it accept the
number of retries (so that htm_init initializes htm_fastpath to the
user-supplied number instead of to 2, for example).
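
Something along these lines is what I have in mind (only a sketch; I have
not checked how this best fits the existing string parsing, and the
htm_tune_disabled flag is made up):

  #include <cstdlib>
  #include <cstring>

  // Made-up flag: when set, keep the retry count chosen by htm_init()
  // fixed and skip the adaptive reoptimization altogether.
  static bool htm_tune_disabled;

  // Sketch of the check that parse_default_method() could perform on
  // the ITM_DEFAULT_METHOD string.  A user-supplied retry count could
  // be parsed here similarly (e.g. "htm_notune:8" via strtoul).
  static void
  check_htm_notune ()
  {
    const char *env = getenv ("ITM_DEFAULT_METHOD");
    if (env != NULL && strstr (env, "htm_notune") != NULL)
      htm_tune_disabled = true;
  }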


Finally, please pay attention to using the same leading tabs/spaces
style as current libitm code; it could be just your MUA, but it seems
that your patch uses just leading spaces.  libitm uses 8-char tabs for
leading space.


> Index: libitm/libitm_i.h
> ===================================================================
> --- libitm/libitm_i.h (revision 219316)
> +++ libitm/libitm_i.h (working copy)
> @@ -242,6 +242,9 @@ struct gtm_thread
>    uint32_t restart_reason[NUM_RESTARTS];
>    uint32_t restart_total;
> 
> +  // Keeps track of how many transactions were successfully executed.
> +  uint64_t number_executed_txns;
> +
>    // *** The shared part of gtm_thread starts here. ***
>    // Shared state is on separate cachelines to avoid false sharing with
>    // thread-local parts of gtm_thread.
> @@ -286,6 +289,8 @@ struct gtm_thread
>    static void *operator new(size_t);
>    static void operator delete(void *);
> 
> +  static void reoptimize_htm_execution();

This should be a member function of gtm_global_optimizer, I believe.

> +
>    // Invoked from assembly language, thus the "asm" specifier on
>    // the name, avoiding complex name mangling.
>    static uint32_t begin_transaction(uint32_t, const gtm_jmpbuf *)
> @@ -309,6 +314,59 @@ struct gtm_thread
>    void commit_user_actions ();
>  };
> 
> +// Different ways to deal with capacity aborts in HTM execution.
> +enum gtm_capacity_abort_mode
> +{
> +  STUBBORN,
> +  HALVEN,
> +  GIVEUP
> +};
> +
> +// Definition of fixed point arithmetic types.
> +// Half the bits are dedicated to the fractional type, and the rest to the
> +// "whole" part.
> +#define FIXED_PT_WIDTH  64
> +#define FIXED_PT_INTEGER_WIDTH  32

Are 64b operations sufficient (ie, 32b fixed-point)?

Also, please put everything related to this into a simple struct with
member functions, instead of the standalone typedefs.

> +typedef uint64_t fixed_pt_t;
> +typedef __uint128_t fixed_pt_td;
> +
> +#define FIXED_PT_FRAC_WIDTH     (FIXED_PT_WIDTH - FIXED_PT_INTEGER_WIDTH)
> +#define FIXED_PT_ONE    ((fixed_pt_t)((fixed_pt_t)1 << FIXED_PT_FRAC_WIDTH))
> +#define FIXED_PT_TWO    (FIXED_PT_ONE + FIXED_PT_ONE)
> +
> +#define fixed_mul(A,B) \
> +  ((fixed_pt_t)(((fixed_pt_td)(A) * (fixed_pt_td)(B)) >> FIXED_PT_FRAC_WIDTH))
> +#define fixed_div(A,B) \
> +  ((fixed_pt_t)(((fixed_pt_td)(A) << FIXED_PT_FRAC_WIDTH) / (fixed_pt_td)(B)))
> +#define fixed_const(R) \
> +  ((fixed_pt_t)((R) * FIXED_PT_ONE + ((R) >= 0 ? 0.5 : -0.5)))

Those should be member functions of the struct, with fixed_const() being
a constructor.
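
Roughly something like this, just to show the shape (widths as in the
current patch; a 32b raw representation might be preferable if we cannot
rely on 64b/128b operations everywhere):

  #include <cstdint>

  // Sketch: 32.32 fixed-point value type with member functions instead
  // of standalone typedefs and macros.
  struct fixed_pt
  {
    uint64_t v;                      // raw value, 32 integer + 32 frac bits
    static const int frac_width = 32;

    fixed_pt () : v (0) { }
    // Takes over the role of fixed_const().
    explicit fixed_pt (double r)
      : v ((uint64_t) (r * ((uint64_t) 1 << frac_width)
                       + (r >= 0 ? 0.5 : -0.5))) { }

    fixed_pt mul (fixed_pt o) const
    {
      fixed_pt r;
      r.v = (uint64_t) (((__uint128_t) v * o.v) >> frac_width);
      return r;
    }
    fixed_pt div (fixed_pt o) const
    {
      fixed_pt r;
      r.v = (uint64_t) (((__uint128_t) v << frac_width) / o.v);
      return r;
    }
    bool operator> (const fixed_pt &o) const { return v > o.v; }
  };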

> +// Maintains the current values optimized for HTM execution and the
> +// corresponding statistics gathered for the decision-making.
> +struct gtm_global_optimizer
> +{
> +  // Mode chosen to currently deal with capacity aborts.
> +  gtm_capacity_abort_mode optimized_mode;
> +  // Number of attempts chosen to currently insist on HTM execution.
> +  uint32_t optimized_attempts;
> +
> +  uint64_t last_cycles;
> +  uint64_t last_total_txs_executed;
> +
> +  fixed_pt_t last_throughput;
> +  uint32_t last_attempts;
> +
> +  fixed_pt_t best_ever_throughput;
> +  uint32_t best_ever_attempts;
> +
> +  uint64_t txns_while_stubborn;
> +  uint64_t cycles_while_stubborn;
> +  uint64_t txns_while_halven;
> +  uint64_t cycles_while_halven;
> +  uint64_t txns_while_giveup;
> +  uint64_t cycles_while_giveup;
> +};

Those will eventually need more comments or a reference to where the
tuning algorithm is explained.  We can tackle that later once the
algorithm is final.



>  } // namespace GTM
> 
>  #include "tls.h"
> @@ -346,6 +404,9 @@ extern gtm_cacheline_mask gtm_mask_stack(gtm_cache
>  // the name, avoiding complex name mangling.
>  extern uint32_t htm_fastpath __asm__(UPFX "gtm_htm_fastpath");
> 
> +// Maintains the optimization for HTM execution.
> +extern gtm_global_optimizer optimizer;
> +
>  } // namespace GTM
> 
>  #endif // LIBITM_I_H
> Index: libitm/libitm.h
> ===================================================================
> --- libitm/libitm.h (revision 219316)
> +++ libitm/libitm.h (working copy)
> @@ -101,7 +101,8 @@ typedef enum
>     /* These are not part of the ABI but used for custom HTM fast paths.  See
>        ITM_beginTransaction and gtm_thread::begin_transaction.  */
>     pr_HTMRetryableAbort = 0x800000,
> -   pr_HTMRetriedAfterAbort = 0x1000000
> +   pr_HTMRetriedAfterAbort = 0x1000000,
> +   pr_HTMCapacityAbort          = 0x2000000

pr_HTMCapacityAbort needs documentation (unlike the other two, it's not
explained in gtm_thread::begin_transaction).

>  } _ITM_codeProperties;
> 
>  /* Result from startTransaction that describes what actions to take.
> Index: libitm/method-serial.cc
> ===================================================================
> --- libitm/method-serial.cc (revision 219316)
> +++ libitm/method-serial.cc (working copy)
> @@ -223,7 +223,23 @@ struct htm_mg : public method_group
>      // initially disabled.
>  #ifdef USE_HTM_FASTPATH
>      htm_fastpath = htm_init();
> +#ifdef HAVE_AS_RTM

This needs to be portable, so we either have to have another function
such as htm_init(), or have an arch-generic initializer if that is
possible.

> +    optimizer.optimized_mode = STUBBORN;
> +    optimizer.optimized_attempts = htm_fastpath;
> +    optimizer.last_cycles = rdtsc();
> +    optimizer.last_total_txs_executed = 0;
> +    optimizer.last_throughput = fixed_const(0.0001);
> +    optimizer.last_attempts = htm_fastpath > 0 ? htm_fastpath - 1 : 1;
> +    optimizer.best_ever_throughput = optimizer.last_throughput;
> +    optimizer.best_ever_attempts = htm_fastpath;
> +    optimizer.txns_while_stubborn = 1;
> +    optimizer.cycles_while_stubborn = 100;

If the assumption is that transactions can't be faster than 100 cycles,
please document it briefly.

> +    optimizer.txns_while_halven = 1;
> +    optimizer.cycles_while_halven = 100;
> +    optimizer.txns_while_giveup = 1;
> +    optimizer.cycles_while_giveup = 100;
>  #endif
> +#endif
>    }
>    virtual void fini()
>    {
> Index: libitm/beginend.cc
> ===================================================================
> --- libitm/beginend.cc (revision 219316)
> +++ libitm/beginend.cc (working copy)
> @@ -25,7 +25,6 @@
>  #include "libitm_i.h"
>  #include <pthread.h>
> 
> -

Minor: just drop this chunk.

> using namespace GTM;
> 
>  #if !defined(HAVE_ARCH_GTM_THREAD) || !defined(HAVE_ARCH_GTM_THREAD_DISP)
> @@ -39,6 +38,9 @@ unsigned GTM::gtm_thread::number_of_threads = 0;
>  gtm_stmlock GTM::gtm_stmlock_array[LOCK_ARRAY_SIZE];
>  atomic<gtm_version> GTM::gtm_clock;
> 
> +// Optimization of HTM executions.
> +gtm_global_optimizer GTM::optimizer;
> +
>  /* ??? Move elsewhere when we figure out library initialization.  */
>  uint64_t GTM::gtm_spin_count_var = 1000;
> 
> @@ -149,6 +151,225 @@ choose_code_path(uint32_t prop, abi_dispatch *disp
>      return a_runInstrumentedCode;
>  }
> 
> +static inline fixed_pt_t

Drop the inline, or use __libitm_always_inline (after Gleb's recent
patch) if the compiler doesn't do proper inlining.

> +fixed_sqrt(fixed_pt_t n)
> +{
> +    int invert = 0;
> +    int iter = FIXED_PT_FRAC_WIDTH;
> +    int l, i;
> +
> +    if (n == 0 || n == FIXED_PT_ONE)
> +      {
> +        return n;
> +      }
> +    if (n < FIXED_PT_ONE && n > 6)
> +      {
> +        invert = 1;
> +        n = fixed_div(FIXED_PT_ONE, n);
> +      }
> +    if (n > FIXED_PT_ONE)
> +      {
> +        int s = n;
> +        iter = 0;
> +        while (s > 0)
> +          {
> +            s >>= 2;
> +            iter++;
> +          }
> +      }
> +
> +    l = (n >> 1) + 1;
> +    for (i = 0; i < iter; i++)
> +      {
> +        l = (l + fixed_div(n, l)) >> 1;
> +      }
> +    if (invert)
> +      {
> +        return (fixed_div(FIXED_PT_ONE, l));
> +      }
> +    return l;
> +}
> +
> +static inline fixed_pt_t
> +fixed_ln(fixed_pt_t x)
> +{
> +  const fixed_pt_t LN2 = fixed_const(0.69314718055994530942);
> +  const fixed_pt_t LG[7] = {
> +    fixed_const(6.666666666666735130e-01),
> +    fixed_const(3.999999999940941908e-01),
> +    fixed_const(2.857142874366239149e-01),
> +    fixed_const(2.222219843214978396e-01),
> +    fixed_const(1.818357216161805012e-01),
> +    fixed_const(1.531383769920937332e-01),
> +    fixed_const(1.479819860511658591e-01)
> +  };
> +
> +  if (x == 0)
> +    {
> +      return 0xffffffff;
> +    }
> +
> +  fixed_pt_t log2 = 0;
> +  fixed_pt_t xi = x;
> +  while (xi > FIXED_PT_TWO)
> +    {
> +      xi >>= 1;
> +      log2++;
> +    }
> +
> +  fixed_pt_t f = xi - FIXED_PT_ONE;
> +  fixed_pt_t s = fixed_div(f, FIXED_PT_TWO + f);
> +  fixed_pt_t z = fixed_mul(s, s);
> +  fixed_pt_t w = fixed_mul(z, z);
> +  fixed_pt_t R =
> +    fixed_mul(w, LG[1] + fixed_mul(w, LG[3] + fixed_mul(w, LG[5])))
> +    + fixed_mul(z, LG[0] + fixed_mul(w, LG[2]
> +        + fixed_mul(w, LG[4] + fixed_mul(w, LG[6]))));
> +  return fixed_mul(LN2, (log2 << FIXED_PT_FRAC_WIDTH))
> +    + f - fixed_mul(s, f - R);
> +}
> +
> +// State not synchronized; assumes single thread usage at any time.
> +// Invoked only by the thread reoptimizing, so assumption holds.
> +int
> +obtainRandomInt(int max)
> +{
> +  static int seed = 123456789;
> +  seed = (1103515245 * seed + 12345) % 4294967296;
> +  return seed;
> +}

Please change this into a simple struct holding the seed and put it into
common.h for now.  No CamelCase.  You should also use uint32_t when
using constants like that, and there's no need for the modulo.
Also add a quick comment on which RNG this is (and possibly the source
of the constants)?
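
E.g., something like this (names are just an example; the constants are
the same ones the patch already uses):

  #include <cstdint>

  // Simple linear congruential generator.  The state is only ever
  // touched by the single reoptimizing thread, so it needs no
  // synchronization; uint32_t arithmetic wraps mod 2^32, so no explicit
  // modulo is needed.
  struct gtm_prng
  {
    uint32_t seed;

    gtm_prng () : seed (123456789) { }

    uint32_t next ()
    {
      seed = 1103515245u * seed + 12345u;
      return seed;
    }
  };

Call sites would then use something like prng.next () % 100 instead of
obtainRandomInt (100).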

> +
> +// Called by the thread at the tail of the list of threads.
> +void
> +GTM::gtm_thread::reoptimize_htm_execution()
> +{
> +  gtm_thread *tx = gtm_thr();
> +  uint64_t total_txs_executed = 0;
> +
> +  // Collect the statistics obtained so far.
> +  serial_lock.read_lock(tx);
> +  gtm_thread *ptr = list_of_threads;
> +  for (; ptr; ptr = ptr->next_thread)
> +    {
> +      total_txs_executed += ptr->number_executed_txns;
> +    }
> +  serial_lock.read_unlock(tx);
> +
> +  // Obtain the delta performance with respect to the last period.
> +  const uint64_t current_cycles = rdtsc();
> +  const uint64_t cycles_used = current_cycles - optimizer.last_cycles;
> +  const uint64_t new_txs_executed =
> +    total_txs_executed - optimizer.last_total_txs_executed;
> +  optimizer.last_cycles = current_cycles;
> +  optimizer.last_total_txs_executed = total_txs_executed;
> +  if (optimizer.optimized_mode == STUBBORN)
> +    {
> +      optimizer.txns_while_stubborn += new_txs_executed;
> +      optimizer.cycles_while_stubborn += cycles_used;
> +    }
> +  else if (optimizer.optimized_mode == HALVEN)
> +    {
> +      optimizer.txns_while_halven += new_txs_executed;
> +      optimizer.cycles_while_halven += cycles_used;
> +    }
> +  else
> +    {
> +      optimizer.txns_while_giveup += new_txs_executed;
> +      optimizer.cycles_while_giveup += cycles_used;
> +    }
> +
> +  // Compute Upper Confidence Bounds for the mode to choose next.
> +  // Use fixed point arithmetic types to spare floating point usage.
> +  const fixed_pt_t log_sum =
> +    fixed_mul(FIXED_PT_TWO,
> +      fixed_ln(fixed_const(optimizer.txns_while_stubborn)
> +               + fixed_const(optimizer.txns_while_halven)
> +               + fixed_const(optimizer.txns_while_giveup)));
> +  const fixed_pt_t ucb_stubborn =
> +    fixed_div(fixed_div(fixed_const(optimizer.txns_while_stubborn),
> +                        fixed_const(optimizer.cycles_while_stubborn)),
> +              optimizer.best_ever_throughput)
> +    + fixed_sqrt(fixed_div(log_sum,
> +                           fixed_const(optimizer.txns_while_stubborn)));
> +  const fixed_pt_t ucb_halven =
> +    fixed_div(fixed_div(fixed_const(optimizer.txns_while_halven),
> +                        fixed_const(optimizer.cycles_while_halven)),
> +              optimizer.best_ever_throughput)
> +    + fixed_sqrt(fixed_div(log_sum, fixed_const(optimizer.txns_while_halven)));
> +  const fixed_pt_t ucb_giveup =
> +    fixed_div(fixed_div(fixed_const(optimizer.txns_while_giveup),
> +                        fixed_const(optimizer.cycles_while_giveup)),
> +              optimizer.best_ever_throughput)
> +    + fixed_sqrt(fixed_div(log_sum, fixed_const(optimizer.txns_while_giveup)));
> +
> +  if (ucb_stubborn > ucb_halven && ucb_stubborn > ucb_giveup)
> +    {
> +      optimizer.optimized_mode = STUBBORN;
> +    }
> +  else if (ucb_halven > ucb_giveup)
> +    {
> +      optimizer.optimized_mode = HALVEN;
> +    }
> +  else
> +    {
> +      optimizer.optimized_mode = GIVEUP;
> +    }
> +
> +  // Compute gradient descent for the number of retries.
> +  const fixed_pt_t current_throughput =
> +    fixed_div(fixed_const(new_txs_executed), fixed_const(cycles_used));
> +  const fixed_pt_t change_for_better =
> +    fixed_div(current_throughput, optimizer.last_throughput);
> +  const fixed_pt_t change_for_worse =
> +    fixed_div(optimizer.last_throughput, current_throughput);
> +  int32_t last_attempts = optimizer.last_attempts;
> +  int32_t current_attempts = optimizer.optimized_attempts;
> +  int32_t new_attempts = current_attempts;
> +  if (unlikely(change_for_worse > fixed_const(1.40)))
> +    {
> +      optimizer.optimized_attempts = optimizer.best_ever_attempts;
> +      optimizer.last_throughput = current_throughput;
> +      optimizer.last_attempts = current_attempts;
> +      return;
> +    }
> +
> +  if (unlikely(obtainRandomInt(100) == 0))
> +    {
> +      optimizer.last_attempts = optimizer.optimized_attempts;
> +      optimizer.last_throughput = current_throughput;
> +      optimizer.optimized_attempts = obtainRandomInt(18) + 2;
> +      return;
> +    }
> +
> +      if (change_for_better > fixed_const(1.05))

Indentation is off.  Perhaps due to badly mixed space/tab indentation?

> +        {
> +          new_attempts += current_attempts - last_attempts;
> +          if (current_attempts == last_attempts)
> +            new_attempts += obtainRandomInt(2) == 0 ? -1 : 1;
> +        }
> +      else if (change_for_worse > fixed_const(1.05))
> +        {
> +          new_attempts -= current_attempts - last_attempts;
> +          if (current_attempts == last_attempts)
> +            new_attempts += obtainRandomInt(2) == 0 ? -1 : 1;
> +        }
> +      if (unlikely(new_attempts > 20))
> +        new_attempts = 20;
> +      else if (unlikely(new_attempts < 2))
> +        new_attempts = 2;
> +      if (current_attempts != new_attempts)
> +        {
> +          optimizer.last_attempts = current_attempts;
> +          optimizer.last_throughput = current_throughput;
> +        }
> +      optimizer.optimized_attempts = new_attempts;
> +      if (unlikely(current_throughput > optimizer.best_ever_throughput))
> +        {
> +          optimizer.best_ever_throughput = current_throughput;
> +          optimizer.best_ever_attempts = current_attempts;
> +        }
> +}

I'll look at this one once we've finalized the tuning algorithm.

> +
>  uint32_t
>  GTM::gtm_thread::begin_transaction (uint32_t prop, const gtm_jmpbuf *jb)
>  {
> @@ -190,7 +411,7 @@ GTM::gtm_thread::begin_transaction (uint32_t prop,
>  #ifndef HTM_CUSTOM_FASTPATH
>    if (likely(htm_fastpath && (prop & pr_hasNoAbort)))
>      {
> -      for (uint32_t t = htm_fastpath; t; t--)
> +      for (uint32_t t = optimizer.optimized_attempts; t; t--)

Why this change?  Couldn't you keep htm_fastpath as the globally set
number of attempts?

>   {
>    uint32_t ret = htm_begin();
>    if (htm_begin_success(ret))
> @@ -209,19 +430,36 @@ GTM::gtm_thread::begin_transaction (uint32_t prop,
>      }
>    // The transaction has aborted.  Don't retry if it's unlikely that
>    // retrying the transaction will be successful.
> -  if (!htm_abort_should_retry(ret))
> +#ifdef HAVE_AS_RTM
> +          if (htm_is_capacity_abort(ret))
> +            {
> +              gtm_capacity_abort_mode capacity_mode = optimizer.optimized_mode;
> +              if (capacity_mode == HALVEN)
> +                t = t / 2 + 1;
> +              else if (capacity_mode == GIVEUP)
> +                t = 1;
> +            }
> +          // Only one thread performs this to avoid the need for
> +          // synchronization. We use the oldest thread that is active.
> +          // We also reoptimize at most once per transaction.
> +          if (unlikely(tx->next_thread == NULL &&
> +              tx->number_executed_txns % 500 == 0 && t == 1))
> +              reoptimize_htm_execution();
> +#else
> +          if (!htm_abort_should_retry(ret))
>      break;
> +#endif
>    // Wait until any concurrent serial-mode transactions have finished.
>    // This is an empty critical section, but won't be elided.
>    if (serial_lock.is_write_locked())
>      {
> -      tx = gtm_thr();
> -      if (unlikely(tx == NULL))
> -        {
> -          // See below.
> -          tx = new gtm_thread();
> -          set_gtm_thr(tx);
> -        }
> +              tx = gtm_thr();
> +              if (unlikely(tx == NULL))
> +                {
> +                  // See below.
> +                  tx = new gtm_thread();
> +                  set_gtm_thr(tx);
> +                }

You seem to have just changed tabs to spaces here.

>        // Check whether there is an enclosing serial-mode transaction;
>        // if so, we just continue as a nested transaction and don't
>        // try to use the HTM fastpath.  This case can happen when an
> @@ -262,12 +500,26 @@ GTM::gtm_thread::begin_transaction (uint32_t prop,
>        // other fallback will use serial transactions, which don't use
>        // restart_total but will reset it when committing.
>        if (!(prop & pr_HTMRetriedAfterAbort))
> - tx->restart_total = htm_fastpath;
> +        tx->restart_total = optimizer.optimized_attempts;
> 
>        if (--tx->restart_total > 0)
>   {
>    // Wait until any concurrent serial-mode transactions have finished.
>    // Essentially the same code as above.

This comment belongs to the code below.

> +#ifdef HAVE_AS_RTM
> +          if (prop & pr_HTMCapacityAbort)
> +            {
> +              gtm_capacity_abort_mode capacity_mode = optimizer.optimized_mode;
> +              if (capacity_mode == HALVEN)
> +                tx->restart_total = tx->restart_total;

That's a no-op.  If you measured with this code, it seems you never
actually tried the HALVEN strategy.  Does that mean we don't need
HALVEN, or do you get other results now?

> +              else if (capacity_mode == GIVEUP)
> +                goto stop_custom_htm_fastpath;
> +            }
> +
> +          if (unlikely(tx->next_thread == NULL &&
> +              tx->number_executed_txns % 500 == 0 && tx->restart_total == 1))
> +              reoptimize_htm_execution();
> +#endif
>    if (serial_lock.is_write_locked())
>      {
>        if (tx->nesting > 0)
> @@ -665,12 +917,21 @@ _ITM_commitTransaction(void)
>    if (likely(htm_fastpath && !gtm_thread::serial_lock.is_write_locked()))
>      {
>        htm_commit();
> +      gtm_thread *tx = gtm_thr();
> +      if (unlikely(tx == NULL))
> +        {
> +          tx = new gtm_thread();
> +          set_gtm_thr(tx);
> +        }

Are there actually situations in which we need to create a gtm_thread?

> +      tx->number_executed_txns++;

It might be nice if we can avoid this, so that we don't touch the
additional cacheline when we have nested txns.

>        return;
>      }
>  #endif
>    gtm_thread *tx = gtm_thr();
>    if (!tx->trycommit ())
>      tx->restart (RESTART_VALIDATE_COMMIT);
> +
> +  tx->number_executed_txns++;
>  }
> 
>  void ITM_REGPARM
> @@ -681,6 +942,13 @@ _ITM_commitTransactionEH(void *exc_ptr)
>    if (likely(htm_fastpath && !gtm_thread::serial_lock.is_write_locked()))
>      {
>        htm_commit();
> +      gtm_thread *tx = gtm_thr();
> +      if (unlikely(tx == NULL))
> +        {
> +          tx = new gtm_thread();
> +          set_gtm_thr(tx);
> +        }
> +      tx->number_executed_txns++;
>        return;
>      }
>  #endif
> @@ -690,4 +958,5 @@ _ITM_commitTransactionEH(void *exc_ptr)
>        tx->eh_in_flight = exc_ptr;
>        tx->restart (RESTART_VALIDATE_COMMIT);
>      }
> +  tx->number_executed_txns++;
>  }
> Index: libitm/config/x86/target.h
> ===================================================================
> --- libitm/config/x86/target.h (revision 219316)
> +++ libitm/config/x86/target.h (working copy)
> @@ -126,12 +126,25 @@ htm_abort_should_retry (uint32_t begin_ret)
>    return begin_ret & _XABORT_RETRY;
>  }
> 
> +static inline bool
> +htm_is_capacity_abort(uint32_t begin_ret)
> +{
> +  return begin_ret & _XABORT_CAPACITY;
> +}

Is this indeed just about capacity, or do we actually mean a more
general situation?

> +
>  /* Returns true iff a hardware transaction is currently being executed.  */
>  static inline bool
>  htm_transaction_active ()
>  {
>    return _xtest() != 0;
>  }
> +
> +static inline uint64_t rdtsc()
> +{
> +    uint32_t hi, lo;
> +    __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
> +    return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 );
> +}
>  #endif
> 
> 
> Index: libitm/config/x86/sjlj.S
> ===================================================================
> --- libitm/config/x86/sjlj.S (revision 219316)
> +++ libitm/config/x86/sjlj.S (working copy)
> @@ -59,12 +59,14 @@
>  #define pr_hasNoAbort 0x08
>  #define pr_HTMRetryableAbort 0x800000
>  #define pr_HTMRetriedAfterAbort 0x1000000
> +#define pr_HTMCapacityAbort     0x2000000
>  #define a_runInstrumentedCode 0x01
>  #define a_runUninstrumentedCode 0x02
>  #define a_tryHTMFastPath 0x20
> 
>  #define _XABORT_EXPLICIT (1 << 0)
>  #define _XABORT_RETRY (1 << 1)
> +#define _XABORT_CAPACITY (1 << 3)
> 
>   .text
> 
> @@ -108,9 +110,12 @@ SYM(_ITM_beginTransaction):
>  .Ltxn_abort:
>   /* If it might make sense to retry the HTM fast path, let the C++
>     code decide.  */
> - testl $(_XABORT_RETRY|_XABORT_EXPLICIT), %eax
> + testl $(_XABORT_RETRY|_XABORT_EXPLICIT|_XABORT_CAPACITY), %eax
>   jz .Lno_htm
>   orl $pr_HTMRetryableAbort, %edi
> + testl   $(_XABORT_CAPACITY), %eax
> + jz      .Lno_htm
> + orl     $pr_HTMCapacityAbort, %edi
>   /* Let the C++ code handle the retry policy.  */
>  .Lno_htm:
>  #endif



^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] add self-tuning to x86 hardware fast path in libitm
  2015-05-18 21:31             ` Torvald Riegel
@ 2015-05-18 21:48               ` Andi Kleen
  2015-05-18 22:04                 ` Torvald Riegel
  2015-05-19  4:54               ` Nuno Diegues
  1 sibling, 1 reply; 20+ messages in thread
From: Andi Kleen @ 2015-05-18 21:48 UTC (permalink / raw)
  To: Torvald Riegel; +Cc: Nuno Diegues, Andi Kleen, gcc-patches, Paolo Romano

> Are there better options for the utility function, or can we tune it to

There is nothing better that isn't a lot slower.

-Andi

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] add self-tuning to x86 hardware fast path in libitm
  2015-05-18 21:48               ` Andi Kleen
@ 2015-05-18 22:04                 ` Torvald Riegel
  0 siblings, 0 replies; 20+ messages in thread
From: Torvald Riegel @ 2015-05-18 22:04 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Nuno Diegues, gcc-patches, Paolo Romano

On Mon, 2015-05-18 at 23:39 +0200, Andi Kleen wrote:
> > Are there better options for the utility function, or can we tune it to
> 
> There is nothing better that isn't a lot slower.

Do you care to elaborate why?  As-is, I find this statement to not be
convincing; at the very least we need to document why we think that
something time-based is the best option.

Other tuning attempts have looked at rates of aborted, attempted, and
committed txns, for example.  Why do we measure the number of transactions in a
whole period, and can't we get the same info through measuring smaller but
more specific time intervals (e.g., how long we wait for a serial txn)?


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] add self-tuning to x86 hardware fast path in libitm
  2015-05-18 21:31             ` Torvald Riegel
  2015-05-18 21:48               ` Andi Kleen
@ 2015-05-19  4:54               ` Nuno Diegues
  2015-05-19 22:31                 ` Torvald Riegel
  1 sibling, 1 reply; 20+ messages in thread
From: Nuno Diegues @ 2015-05-19  4:54 UTC (permalink / raw)
  To: Torvald Riegel; +Cc: Andi Kleen, gcc-patches, Paolo Romano

On Mon, May 18, 2015 at 5:29 PM, Torvald Riegel <triegel@redhat.com> wrote:
>
> First of all, sorry for taking so long to review this.  Thank you for
> the contribution.


Hello Torvald,

thanks for taking the time to look into this!


> My major concern is about rdtsc being used.  The relation to frequency
> adaption that Andi mentioned is one issue, but the bigger issue to me is
> that the runtime of transactions might vary a lot, and that the relation
> of txnal vs. nontxnal code executed in a period might vary a lot too.


Once again, I believe that frequency scaling should not be a concern: recent
CPUs use a constant-rate TSC that matches the maximum frequency of the
CPU.


>
> Are there better options for the utility function, or can we tune it to
> be less affected by varying txn length and likelihood of txnal vs.
> nontxnal code?  What are the things we want to avoid with the tuning?  I
> can think of:
> * Not needing to wait for serial txns, or being aborted by a serial txn.
> * Not retrying using HTM too much so that the retry time is larger than
> the scalability we actually gain by having other txns commit
> concurrently.


Yes, those are the key points we want to make sure that do not happen.

>
>
> Anything else?  Did you consider other utility functions during your
> research?


The txnal vs. nontxnal ratio is indeed a completely different story. To account
for this we would need extra book-keeping to count only the cycles spent inside
txnal code. That would require every thread (or a sample of threads) to
perform an rdtsc (or equivalent) on every begin/end call rather than the
current approach of a single rdtsc per optimization round.
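
Roughly, that extra book-keeping would look like this (everything here is
hypothetical, not something I have implemented):

  #include <cstdint>

  // Per-thread accounting of cycles spent inside txns, so that the
  // utility function divides committed txns by txnal cycles only.
  // This means two rdtsc calls per transaction instead of one rdtsc
  // per optimization round.
  struct txn_cycle_accounting
  {
    uint64_t txn_start_cycles = 0;
    uint64_t cycles_in_txns = 0;

    void on_begin (uint64_t now) { txn_start_cycles = now; }
    void on_commit (uint64_t now) { cycles_in_txns += now - txn_start_cycles; }
  };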

With this type of online optimization we found that the algorithm had to be
very simple and cheap to execute. RDTSC was a good fit for this, and
it enabled us to obtain gains. Other time sources failed to do so.

I do not have, out of the box, a good alternative to offer. I suppose it would
take some iterations of thinking and experimenting, just like any research
problem :)


> Also, note that the mitigation strategy for rdtsc
> short-comings that you mention in the paper is not applicable in
> general, specifically not in libitm.


I suppose you mean the preemption of threads inflating the cycles measured?
This would similarly be a problem for any time source that tries to measure the
amount of work performed; not sure how we can avoid it in general. Any thoughts?


> Another issue is that we need to implement the tuning in a portable way.
> You currently make it depend on whether the assembler knows about RTM,
> whereas the rest of the code makes this more portable and defers to
> arch-specific files.  I'd prefer if we could use the tuning on other
> archs too.  But for that, we need to cleanly separate generic from
> arch-specific parts.  That affects magic numbers as well as things like
> rdtsc().


Yes, I refrained from adding new calls to the arch-specific files, mainly to
contain the changes. But that is possible, and it's part of the feedback I
was hoping to get.



> Generally, the patch needs to include more comments.  Most importantly,
> we need to document why we implemented things the way we did, or do
> tuning the way we do.  In cases where neither the intent nor the code is
> trivial to see, explain the intent or reference the place where
> you document the big picture.
> A good rule of thumb is adding comments whenever it's not simple to see
> why we use one of several possible implementation alternatives.
>
> The magic numbers being used are a good example.  Make them constants
> and don't just put them directly in the code, and then add documentation
> why you chose this number and why it's okay to make that choice.  If it
> isn't necessarily okay (eg, other archs, future systems), but you don't
> have a better solution right now, add something like "??? Not ideal
> because of XYZ.".  If there's a source for some of the magic numbers,
> cite it (e.g., [28] in your paper might be one for the tuning
> thresholds, I guess).


Ack, makes perfect sense.



> Reoptimizing only in a specific, fixed thread is insufficient in the
> general case.  There may be a thread that only runs an initial txn and
> then never another one; if this happens to be the last thread
> registered, you'll never get any tuning.  If we want the tuning to be
> effective, one of the threads *actively* running txns needs to tune
> eventually, always.
>
> Depending on that, you'll probably have to change the sync between the
> tuning thread and others.  Serial mode may be a simple way to do that.
> It may be possible to only check for tuning being necessary when in
> serial mode.  It could be possible that we end up trying HTM too often
> yet never go to serial mode; however, that seems unlikely to me (but I
> haven't thought thoroughly about it).
>
> Also, please remember that only data-race-free code is strictly correct
> C++ code (the only exception being the validated-loads special case in
> the STM implementations, for which C++ doesn't have support yet (but for
> which proposals exist)).  Thus, use atomic operations accordingly.  That
> might create another issue though in that you can't assume 64b atomic
> ops to be supported on all archs.  Maybe using serial mode for tuning
> doesn't trigger that issue though.


Re-optimizing in a concurrent fashion is something we should really avoid.
I made some attempts at doing so, and the synchronization easily kills the
gains. The current approach was really nice from the performance perspective,
but I understand your concern.
The main problem with using the serial lock for this is the obvious disadvantage
that it increases the time the lock is held (forbidding hardware transactions).
That should be quite small in the grand scheme of things, though, so it is worth
giving it a shot and seeing how it behaves across several workloads.
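Just to make the idea concrete, a very rough sketch of checking for tuning while
in serial mode (the predicate name is made up, and I am assuming the tuning
routine would then walk the thread list without taking the read lock itself,
since the write lock already excludes concurrent transactions):

  // Hypothetical: executed right after a transaction has switched to
  // serial mode, i.e. while the serial lock is held for writing.
  if (tuning_epoch_elapsed ())
    optimizer.reoptimize_htm_execution ();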


>
> I'm wondering about whether it really makes sense to treat XABORT like
> conflicts and other abort reasons, instead of like capacity aborts.
> Perhaps we need to differentiate between the explicit abort codes glibc
> uses, so that we can distinguish between cases where an abort is
> supposed to signal incompatibility with txnal execution and cases where
> it's just used for lock elision (see sysdeps/unix/sysv/linux/x86/hle.h
> in current glibc):
> #define _ABORT_LOCK_BUSY        0xff
> #define _ABORT_LOCK_IS_LOCKED   0xfe
> #define _ABORT_NESTED_TRYLOCK   0xfd


I am not sure I follow: are you suggesting to consider other aborts besides
those of capacity?


>
>
> Andi said that it would be nice to have an env var turning tuning
> on/off.  I agree in principle, yet would prefer to have the tuning be
> the default.  What about adding an "htm_notune" option in
> parse_default_method?  It would be even nicer to have it accept the
> number of retries (so that htm_init initializes htm_fastpath to the
> user-supplied number instead of to 2, for example).


Will do.
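Roughly along these lines, I suppose (the tuning_enabled flag and the exact
placement inside parse_default_method are just my assumption; untested):

  else if (strncmp (env, "htm_notune", 10) == 0)
    {
      // Hypothetical flag: disable the self-tuning and keep the number
      // of attempts that htm_init() chose.
      optimizer.tuning_enabled = false;
      env += 10;
    }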

>
>
>
> Finally, please pay attention to using the same leading tabs/spaces
> style as current libitm code; it could be just your MUA, but it seems
> that your patch uses just leading spaces.  libitm uses 8-char tabs for
> leading space.


Now that I know the style, I will enforce it ;)
Previously I had trusted my editor to follow the current style, but maybe
it was not consistent everywhere, or it just got confused.




>
> > Index: libitm/libitm_i.h
> > ===================================================================
> > --- libitm/libitm_i.h (revision 219316)
> > +++ libitm/libitm_i.h (working copy)
> > @@ -242,6 +242,9 @@ struct gtm_thread
> >    uint32_t restart_reason[NUM_RESTARTS];
> >    uint32_t restart_total;
> >
> > +  // Keeps track of how many transactions were successfully executed.
> > +  uint64_t number_executed_txns;
> > +
> >    // *** The shared part of gtm_thread starts here. ***
> >    // Shared state is on separate cachelines to avoid false sharing with
> >    // thread-local parts of gtm_thread.
> > @@ -286,6 +289,8 @@ struct gtm_thread
> >    static void *operator new(size_t);
> >    static void operator delete(void *);
> >
> > +  static void reoptimize_htm_execution();
>
> This should be a member function of gtm_global_optimizer, I believe.


Ack

>
>
> > +
> >    // Invoked from assembly language, thus the "asm" specifier on
> >    // the name, avoiding complex name mangling.
> >    static uint32_t begin_transaction(uint32_t, const gtm_jmpbuf *)
> > @@ -309,6 +314,59 @@ struct gtm_thread
> >    void commit_user_actions ();
> >  };
> >
> > +// Different ways to deal with capacity aborts in HTM execution.
> > +enum gtm_capacity_abort_mode
> > +{
> > +  STUBBORN,
> > +  HALVEN,
> > +  GIVEUP
> > +};
> > +
> > +// Definition of fixed point arithmetic types.
> > +// Half the bits are dedicated to the fractional type, and the rest to the
> > +// "whole" part.
> > +#define FIXED_PT_WIDTH  64
> > +#define FIXED_PT_INTEGER_WIDTH  32
>
> Are 64b operations sufficient (ie, 32b fixed-point)?
>
> Also, please put everything related to this into a simple struct with
> member functions, instead of the standalone typedefs.


Ack

>
>
> > +typedef uint64_t fixed_pt_t;
> > +typedef __uint128_t fixed_pt_td;
> > +
> > +#define FIXED_PT_FRAC_WIDTH     (FIXED_PT_WIDTH - FIXED_PT_INTEGER_WIDTH)
> > +#define FIXED_PT_ONE    ((fixed_pt_t)((fixed_pt_t)1 << FIXED_PT_FRAC_WIDTH))
> > +#define FIXED_PT_TWO    (FIXED_PT_ONE + FIXED_PT_ONE)
> > +
> > +#define fixed_mul(A,B) \
> > +  ((fixed_pt_t)(((fixed_pt_td)(A) * (fixed_pt_td)(B)) >> FIXED_PT_FRAC_WIDTH))
> > +#define fixed_div(A,B) \
> > +  ((fixed_pt_t)(((fixed_pt_td)(A) << FIXED_PT_FRAC_WIDTH) / (fixed_pt_td)(B)))
> > +#define fixed_const(R) \
> > +  ((fixed_pt_t)((R) * FIXED_PT_ONE + ((R) >= 0 ? 0.5 : -0.5)))
>
> Those should be member functions of the struct, with fixed_const() being
> a constructor.


That's a good idea, thanks!
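Something like the following, I imagine; this is just a sketch mirroring the
FIXED_PT_* macros above (32.32 format), with the double constructor playing the
role of fixed_const():

#include <stdint.h>

// 32.32 fixed-point value; __uint128_t holds the intermediate products,
// exactly as in the macros above.
struct fixed_pt
{
  uint64_t v;

  fixed_pt () : v (0) {}
  // Equivalent of fixed_const (R).
  explicit fixed_pt (double r)
    : v ((uint64_t) (r * ((uint64_t) 1 << 32) + (r >= 0 ? 0.5 : -0.5))) {}

  fixed_pt mul (fixed_pt o) const
  {
    fixed_pt res;
    res.v = (uint64_t) (((__uint128_t) v * o.v) >> 32);
    return res;
  }

  fixed_pt div (fixed_pt o) const
  {
    fixed_pt res;
    res.v = (uint64_t) ((((__uint128_t) v) << 32) / o.v);
    return res;
  }
};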

>
>
> > +// Maintains the current values optimized for HTM execution and the
> > +// corresponding statistics gathered for the decision-making.
> > +struct gtm_global_optimizer
> > +{
> > +  // Mode chosen to currently deal with capacity aborts.
> > +  gtm_capacity_abort_mode optimized_mode;
> > +  // Number of attempts chosen to currently insist on HTM execution.
> > +  uint32_t optimized_attempts;
> > +
> > +  uint64_t last_cycles;
> > +  uint64_t last_total_txs_executed;
> > +
> > +  fixed_pt_t last_throughput;
> > +  uint32_t last_attempts;
> > +
> > +  fixed_pt_t best_ever_throughput;
> > +  uint32_t best_ever_attempts;
> > +
> > +  uint64_t txns_while_stubborn;
> > +  uint64_t cycles_while_stubborn;
> > +  uint64_t txns_while_halven;
> > +  uint64_t cycles_while_halven;
> > +  uint64_t txns_while_giveup;
> > +  uint64_t cycles_while_giveup;
> > +};
>
> Those will eventually need more comments or a reference to where the
> tuning algorithm is explained.  We can tackle that later once the
> algorithm is final.
>


Ack

>
>
>
> >  } // namespace GTM
> >
> >  #include "tls.h"
> > @@ -346,6 +404,9 @@ extern gtm_cacheline_mask gtm_mask_stack(gtm_cache
> >  // the name, avoiding complex name mangling.
> >  extern uint32_t htm_fastpath __asm__(UPFX "gtm_htm_fastpath");
> >
> > +// Maintains the optimization for HTM execution.
> > +extern gtm_global_optimizer optimizer;
> > +
> >  } // namespace GTM
> >
> >  #endif // LIBITM_I_H
> > Index: libitm/libitm.h
> > ===================================================================
> > --- libitm/libitm.h (revision 219316)
> > +++ libitm/libitm.h (working copy)
> > @@ -101,7 +101,8 @@ typedef enum
> >     /* These are not part of the ABI but used for custom HTM fast paths.  See
> >        ITM_beginTransaction and gtm_thread::begin_transaction.  */
> >     pr_HTMRetryableAbort = 0x800000,
> > -   pr_HTMRetriedAfterAbort = 0x1000000
> > +   pr_HTMRetriedAfterAbort = 0x1000000,
> > +   pr_HTMCapacityAbort          = 0x2000000
>
> pr_HTMCapacityAbort needs documentation (unlike the other two, it's not
> explained in gtm_thread::begin_transaction.


Ack

>
>
> >  } _ITM_codeProperties;
> >
> >  /* Result from startTransaction that describes what actions to take.
> > Index: libitm/method-serial.cc
> > ===================================================================
> > --- libitm/method-serial.cc (revision 219316)
> > +++ libitm/method-serial.cc (working copy)
> > @@ -223,7 +223,23 @@ struct htm_mg : public method_group
> >      // initially disabled.
> >  #ifdef USE_HTM_FASTPATH
> >      htm_fastpath = htm_init();
> > +#ifdef HAVE_AS_RTM
>
> This needs to be portable, so we either have to have another function
> such as htm_init(), or have a arch-generic initializer if that is
> possible.


Yes, we discussed this partly above, as this entails making changes to the arch
files.

>
>
> > +    optimizer.optimized_mode = STUBBORN;
> > +    optimizer.optimized_attempts = htm_fastpath;
> > +    optimizer.last_cycles = rdtsc();
> > +    optimizer.last_total_txs_executed = 0;
> > +    optimizer.last_throughput = fixed_const(0.0001);
> > +    optimizer.last_attempts = htm_fastpath > 0 ? htm_fastpath - 1 : 1;
> > +    optimizer.best_ever_throughput = optimizer.last_throughput;
> > +    optimizer.best_ever_attempts = htm_fastpath;
> > +    optimizer.txns_while_stubborn = 1;
> > +    optimizer.cycles_while_stubborn = 100;
>
> If the assumption is that transactions can't be faster than 100 cycles,
> please document it briefly.


No, it does not imply that. This is an "accumulator" of the cycles spent in
the mode; it only needs to be positive, as far as I remember.


>
> > +    optimizer.txns_while_halven = 1;
> > +    optimizer.cycles_while_halven = 100;
> > +    optimizer.txns_while_giveup = 1;
> > +    optimizer.cycles_while_giveup = 100;
> >  #endif
> > +#endif
> >    }
> >    virtual void fini()
> >    {
> > Index: libitm/beginend.cc
> > ===================================================================
> > --- libitm/beginend.cc (revision 219316)
> > +++ libitm/beginend.cc (working copy)
> > @@ -25,7 +25,6 @@
> >  #include "libitm_i.h"
> >  #include <pthread.h>
> >
> > -
>
> Minor: just drop this chunk.


What is this referring to?


>
>
> > using namespace GTM;
> >
> >  #if !defined(HAVE_ARCH_GTM_THREAD) || !defined(HAVE_ARCH_GTM_THREAD_DISP)
> > @@ -39,6 +38,9 @@ unsigned GTM::gtm_thread::number_of_threads = 0;
> >  gtm_stmlock GTM::gtm_stmlock_array[LOCK_ARRAY_SIZE];
> >  atomic<gtm_version> GTM::gtm_clock;
> >
> > +// Optimization of HTM executions.
> > +gtm_global_optimizer GTM::optimizer;
> > +
> >  /* ??? Move elsewhere when we figure out library initialization.  */
> >  uint64_t GTM::gtm_spin_count_var = 1000;
> >
> > @@ -149,6 +151,225 @@ choose_code_path(uint32_t prop, abi_dispatch *disp
> >      return a_runInstrumentedCode;
> >  }
> >
> > +static inline fixed_pt_t
>
> Drop the inline, or use __libitm_always_inline (after Gleb's recent
> patch) if the compiler doesn't do proper inlining.


Ack

>
>
> > +fixed_sqrt(fixed_pt_t n)
> > +{
> > +    int invert = 0;
> > +    int iter = FIXED_PT_FRAC_WIDTH;
> > +    int l, i;
> > +
> > +    if (n == 0 || n == FIXED_PT_ONE)
> > +      {
> > +        return n;
> > +      }
> > +    if (n < FIXED_PT_ONE && n > 6)
> > +      {
> > +        invert = 1;
> > +        n = fixed_div(FIXED_PT_ONE, n);
> > +      }
> > +    if (n > FIXED_PT_ONE)
> > +      {
> > +        int s = n;
> > +        iter = 0;
> > +        while (s > 0)
> > +          {
> > +            s >>= 2;
> > +            iter++;
> > +          }
> > +      }
> > +
> > +    l = (n >> 1) + 1;
> > +    for (i = 0; i < iter; i++)
> > +      {
> > +        l = (l + fixed_div(n, l)) >> 1;
> > +      }
> > +    if (invert)
> > +      {
> > +        return (fixed_div(FIXED_PT_ONE, l));
> > +      }
> > +    return l;
> > +}
> > +
> > +static inline fixed_pt_t
> > +fixed_ln(fixed_pt_t x)
> > +{
> > +  const fixed_pt_t LN2 = fixed_const(0.69314718055994530942);
> > +  const fixed_pt_t LG[7] = {
> > +    fixed_const(6.666666666666735130e-01),
> > +    fixed_const(3.999999999940941908e-01),
> > +    fixed_const(2.857142874366239149e-01),
> > +    fixed_const(2.222219843214978396e-01),
> > +    fixed_const(1.818357216161805012e-01),
> > +    fixed_const(1.531383769920937332e-01),
> > +    fixed_const(1.479819860511658591e-01)
> > +  };
> > +
> > +  if (x == 0)
> > +    {
> > +      return 0xffffffff;
> > +    }
> > +
> > +  fixed_pt_t log2 = 0;
> > +  fixed_pt_t xi = x;
> > +  while (xi > FIXED_PT_TWO)
> > +    {
> > +      xi >>= 1;
> > +      log2++;
> > +    }
> > +
> > +  fixed_pt_t f = xi - FIXED_PT_ONE;
> > +  fixed_pt_t s = fixed_div(f, FIXED_PT_TWO + f);
> > +  fixed_pt_t z = fixed_mul(s, s);
> > +  fixed_pt_t w = fixed_mul(z, z);
> > +  fixed_pt_t R =
> > +    fixed_mul(w, LG[1] + fixed_mul(w, LG[3] + fixed_mul(w, LG[5])))
> > +    + fixed_mul(z, LG[0] + fixed_mul(w, LG[2]
> > +        + fixed_mul(w, LG[4] + fixed_mul(w, LG[6]))));
> > +  return fixed_mul(LN2, (log2 << FIXED_PT_FRAC_WIDTH))
> > +    + f - fixed_mul(s, f - R);
> > +}
> > +
> > +// State not synchronized; assumes single thread usage at any time.
> > +// Invoked only by the thread reoptimizing, so assumption holds.
> > +int
> > +obtainRandomInt(int max)
> > +{
> > +  static int seed = 123456789;
> > +  seed = (1103515245 * seed + 12345) % 4294967296;
> > +  return seed;
> > +}
>
> Please change this into a simple struct holding the seed and put it into
> common.h for now.  No CamelCase.  You should also use uint32_t when
> using constants like that, and there's no need for the modulo.
> Also add a quick comment on which RNG this is (and possibly the source
> of the constants)?
>

Will do.
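Presumably something along these lines (the struct name is a placeholder; the
constants are the classic ANSI C rand() LCG ones, and the modulo now happens
implicitly through uint32_t wrap-around):

#include <stdint.h>

// Simple linear congruential generator.  Not synchronized: it is only
// ever used by the single re-optimizing thread.
struct gtm_prng
{
  uint32_t seed;

  uint32_t next ()
  {
    seed = 1103515245u * seed + 12345u;  // wraps modulo 2^32
    return seed;
  }
  // Callers can bound it, e.g. next () % 100 for a value in [0, 100).
};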

> > +
> > +// Called by the thread at the tail of the list of threads.
> > +void
> > +GTM::gtm_thread::reoptimize_htm_execution()
> > +{
> > +  gtm_thread *tx = gtm_thr();
> > +  uint64_t total_txs_executed = 0;
> > +
> > +  // Collect the statistics obtained so far.
> > +  serial_lock.read_lock(tx);
> > +  gtm_thread *ptr = list_of_threads;
> > +  for (; ptr; ptr = ptr->next_thread)
> > +    {
> > +      total_txs_executed += ptr->number_executed_txns;
> > +    }
> > +  serial_lock.read_unlock(tx);
> > +
> > +  // Obtain the delta performance with respect to the last period.
> > +  const uint64_t current_cycles = rdtsc();
> > +  const uint64_t cycles_used = current_cycles - optimizer.last_cycles;
> > +  const uint64_t new_txs_executed =
> > +    total_txs_executed - optimizer.last_total_txs_executed;
> > +  optimizer.last_cycles = current_cycles;
> > +  optimizer.last_total_txs_executed = total_txs_executed;
> > +  if (optimizer.optimized_mode == STUBBORN)
> > +    {
> > +      optimizer.txns_while_stubborn += new_txs_executed;
> > +      optimizer.cycles_while_stubborn += cycles_used;
> > +    }
> > +  else if (optimizer.optimized_mode == HALVEN)
> > +    {
> > +      optimizer.txns_while_halven += new_txs_executed;
> > +      optimizer.cycles_while_halven += cycles_used;
> > +    }
> > +  else
> > +    {
> > +      optimizer.txns_while_giveup += new_txs_executed;
> > +      optimizer.cycles_while_giveup += cycles_used;
> > +    }
> > +
> > +  // Compute Upper Confidence Bounds for the mode to choose next.
> > +  // Use fixed point arithmetic types to spare floating point usage.
> > +  const fixed_pt_t log_sum =
> > +    fixed_mul(FIXED_PT_TWO,
> > +      fixed_ln(fixed_const(optimizer.txns_while_stubborn)
> > +               + fixed_const(optimizer.txns_while_halven)
> > +               + fixed_const(optimizer.txns_while_giveup)));
> > +  const fixed_pt_t ucb_stubborn =
> > +    fixed_div(fixed_div(fixed_const(optimizer.txns_while_stubborn),
> > +                        fixed_const(optimizer.cycles_while_stubborn)),
> > +              optimizer.best_ever_throughput)
> > +    + fixed_sqrt(fixed_div(log_sum,
> > +                           fixed_const(optimizer.txns_while_stubborn)));
> > +  const fixed_pt_t ucb_halven =
> > +    fixed_div(fixed_div(fixed_const(optimizer.txns_while_halven),
> > +                        fixed_const(optimizer.cycles_while_halven)),
> > +              optimizer.best_ever_throughput)
> > +    + fixed_sqrt(fixed_div(log_sum, fixed_const(optimizer.txns_while_halven)));
> > +  const fixed_pt_t ucb_giveup =
> > +    fixed_div(fixed_div(fixed_const(optimizer.txns_while_giveup),
> > +                        fixed_const(optimizer.cycles_while_giveup)),
> > +              optimizer.best_ever_throughput)
> > +    + fixed_sqrt(fixed_div(log_sum, fixed_const(optimizer.txns_while_giveup)));
> > +
> > +  if (ucb_stubborn > ucb_halven && ucb_stubborn > ucb_giveup)
> > +    {
> > +      optimizer.optimized_mode = STUBBORN;
> > +    }
> > +  else if (ucb_halven > ucb_giveup)
> > +    {
> > +      optimizer.optimized_mode = HALVEN;
> > +    }
> > +  else
> > +    {
> > +      optimizer.optimized_mode = GIVEUP;
> > +    }
> > +
> > +  // Compute gradient descent for the number of retries.
> > +  const fixed_pt_t current_throughput =
> > +    fixed_div(fixed_const(new_txs_executed), fixed_const(cycles_used));
> > +  const fixed_pt_t change_for_better =
> > +    fixed_div(current_throughput, optimizer.last_throughput);
> > +  const fixed_pt_t change_for_worse =
> > +    fixed_div(optimizer.last_throughput, current_throughput);
> > +  int32_t last_attempts = optimizer.last_attempts;
> > +  int32_t current_attempts = optimizer.optimized_attempts;
> > +  int32_t new_attempts = current_attempts;
> > +  if (unlikely(change_for_worse > fixed_const(1.40)))
> > +    {
> > +      optimizer.optimized_attempts = optimizer.best_ever_attempts;
> > +      optimizer.last_throughput = current_throughput;
> > +      optimizer.last_attempts = current_attempts;
> > +      return;
> > +    }
> > +
> > +  if (unlikely(obtainRandomInt(100) == 0))
> > +    {
> > +      optimizer.last_attempts = optimizer.optimized_attempts;
> > +      optimizer.last_throughput = current_throughput;
> > +      optimizer.optimized_attempts = obtainRandomInt(18) + 2;
> > +      return;
> > +    }
> > +
> > +      if (change_for_better > fixed_const(1.05))
>
> Indentation is off.  Perhaps due to badly mixed space/tab indentation?

Will check.

>
> > +        {
> > +          new_attempts += current_attempts - last_attempts;
> > +          if (current_attempts == last_attempts)
> > +            new_attempts += obtainRandomInt(2) == 0 ? -1 : 1;
> > +        }
> > +      else if (change_for_worse > fixed_const(1.05))
> > +        {
> > +          new_attempts -= current_attempts - last_attempts;
> > +          if (current_attempts == last_attempts)
> > +            new_attempts += obtainRandomInt(2) == 0 ? -1 : 1;
> > +        }
> > +      if (unlikely(new_attempts > 20))
> > +        new_attempts = 20;
> > +      else if (unlikely(new_attempts < 2))
> > +        new_attempts = 2;
> > +      if (current_attempts != new_attempts)
> > +        {
> > +          optimizer.last_attempts = current_attempts;
> > +          optimizer.last_throughput = current_throughput;
> > +        }
> > +      optimizer.optimized_attempts = new_attempts;
> > +      if (unlikely(current_throughput > optimizer.best_ever_throughput))
> > +        {
> > +          optimizer.best_ever_throughput = current_throughput;
> > +          optimizer.best_ever_attempts = current_attempts;
> > +        }
> > +}
>
> I'll look at this one once we've finalized the tuning algorithm.
>
> > +
> >  uint32_t
> >  GTM::gtm_thread::begin_transaction (uint32_t prop, const gtm_jmpbuf *jb)
> >  {
> > @@ -190,7 +411,7 @@ GTM::gtm_thread::begin_transaction (uint32_t prop,
> >  #ifndef HTM_CUSTOM_FASTPATH
> >    if (likely(htm_fastpath && (prop & pr_hasNoAbort)))
> >      {
> > -      for (uint32_t t = htm_fastpath; t; t--)
> > +      for (uint32_t t = optimizer.optimized_attempts; t; t--)
>
> Why this change?  Couldn't you keep htm_fastpath as the globally set
> number of attempts?

The idea was to keep all the tuned variables within the optimizer struct. Hence,
optimized_attempts replaced the role of htm_fastpath whenever the tuning is
enabled. We can keep htm_fastpath in sync with the optimizer, though, by
additionally assigning it the final value of optimized_attempts during
re-optimization.
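Concretely, the sync-up could be as simple as this at the end of the
re-optimization (sketch only):

  optimizer.optimized_attempts = new_attempts;
  // Keep the global setting in sync, so that code paths that do not know
  // about the optimizer keep seeing the tuned number of attempts.
  htm_fastpath = optimizer.optimized_attempts;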


>
> >   {
> >    uint32_t ret = htm_begin();
> >    if (htm_begin_success(ret))
> > @@ -209,19 +430,36 @@ GTM::gtm_thread::begin_transaction (uint32_t prop,
> >      }
> >    // The transaction has aborted.  Don't retry if it's unlikely that
> >    // retrying the transaction will be successful.
> > -  if (!htm_abort_should_retry(ret))
> > +#ifdef HAVE_AS_RTM
> > +          if (htm_is_capacity_abort(ret))
> > +            {
> > +              gtm_capacity_abort_mode capacity_mode = optimizer.optimized_mode;
> > +              if (capacity_mode == HALVEN)
> > +                t = t / 2 + 1;
> > +              else if (capacity_mode == GIVEUP)
> > +                t = 1;
> > +            }
> > +          // Only one thread performs this to avoid the need for
> > +          // synchronization. We use the oldest thread that is active.
> > +          // We also reoptimize at most once per transaction.
> > +          if (unlikely(tx->next_thread == NULL &&
> > +              tx->number_executed_txns % 500 == 0 && t == 1))
> > +              reoptimize_htm_execution();
> > +#else
> > +          if (!htm_abort_should_retry(ret))
> >      break;
> > +#endif
> >    // Wait until any concurrent serial-mode transactions have finished.
> >    // This is an empty critical section, but won't be elided.
> >    if (serial_lock.is_write_locked())
> >      {
> > -      tx = gtm_thr();
> > -      if (unlikely(tx == NULL))
> > -        {
> > -          // See below.
> > -          tx = new gtm_thread();
> > -          set_gtm_thr(tx);
> > -        }
> > +              tx = gtm_thr();
> > +              if (unlikely(tx == NULL))
> > +                {
> > +                  // See below.
> > +                  tx = new gtm_thread();
> > +                  set_gtm_thr(tx);
> > +                }
>
> You seemed to have changed just tabs to whitespace.

Will check.

>
> >        // Check whether there is an enclosing serial-mode transaction;
> >        // if so, we just continue as a nested transaction and don't
> >        // try to use the HTM fastpath.  This case can happen when an
> > @@ -262,12 +500,26 @@ GTM::gtm_thread::begin_transaction (uint32_t prop,
> >        // other fallback will use serial transactions, which don't use
> >        // restart_total but will reset it when committing.
> >        if (!(prop & pr_HTMRetriedAfterAbort))
> > - tx->restart_total = htm_fastpath;
> > +        tx->restart_total = optimizer.optimized_attempts;
> >
> >        if (--tx->restart_total > 0)
> >   {
> >    // Wait until any concurrent serial-mode transactions have finished.
> >    // Essentially the same code as above.
>
> This comment belongs to the code below.

Ack

>
> > +#ifdef HAVE_AS_RTM
> > +          if (prop & pr_HTMCapacityAbort)
> > +            {
> > +              gtm_capacity_abort_mode capacity_mode = optimizer.optimized_mode;
> > +              if (capacity_mode == HALVEN)
> > +                tx->restart_total = tx->restart_total;
>
> That's a no-op.  If you measured with this code, it seems you never
> actually tried the HALVEN strategy.  Does that mean we don't need
> HALVEN, or do you get other results now?

That's definitely a typo (missing the division; it crept in with the fixed-point
change, I believe). The halven mode is definitely not a big winner in my
experiments. Still, there are cases where I measured it to be the optimal
choice. As the effort/complexity of keeping it is small, I always opted to do so.
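For reference, the intended line is presumably the same halving used in the
fast-path HALVEN case above, i.e. something like:

  tx->restart_total = tx->restart_total / 2 + 1;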


>
> > +              else if (capacity_mode == GIVEUP)
> > +                goto stop_custom_htm_fastpath;
> > +            }
> > +
> > +          if (unlikely(tx->next_thread == NULL &&
> > +              tx->number_executed_txns % 500 == 0 && tx->restart_total == 1))
> > +              reoptimize_htm_execution();
> > +#endif
> >    if (serial_lock.is_write_locked())
> >      {
> >        if (tx->nesting > 0)
> > @@ -665,12 +917,21 @@ _ITM_commitTransaction(void)
> >    if (likely(htm_fastpath && !gtm_thread::serial_lock.is_write_locked()))
> >      {
> >        htm_commit();
> > +      gtm_thread *tx = gtm_thr();
> > +      if (unlikely(tx == NULL))
> > +        {
> > +          tx = new gtm_thread();
> > +          set_gtm_thr(tx);
> > +        }
>
> Are there actually situations in which we need to create a gtm_thread?

I did not check at run-time; statically, I followed the check-and-create
approach, which is quite omnipresent in the code.

>
> > +      tx->number_executed_txns++;
>
> It might be nice if we can avoid this, so that we don't touch the
> additional cacheline when we have nested txns.

It is really important to have some way to measure success/progress
to guide the tuning.


>
> >        return;
> >      }
> >  #endif
> >    gtm_thread *tx = gtm_thr();
> >    if (!tx->trycommit ())
> >      tx->restart (RESTART_VALIDATE_COMMIT);
> > +
> > +  tx->number_executed_txns++;
> >  }
> >
> >  void ITM_REGPARM
> > @@ -681,6 +942,13 @@ _ITM_commitTransactionEH(void *exc_ptr)
> >    if (likely(htm_fastpath && !gtm_thread::serial_lock.is_write_locked()))
> >      {
> >        htm_commit();
> > +      gtm_thread *tx = gtm_thr();
> > +      if (unlikely(tx == NULL))
> > +        {
> > +          tx = new gtm_thread();
> > +          set_gtm_thr(tx);
> > +        }
> > +      tx->number_executed_txns++;
> >        return;
> >      }
> >  #endif
> > @@ -690,4 +958,5 @@ _ITM_commitTransactionEH(void *exc_ptr)
> >        tx->eh_in_flight = exc_ptr;
> >        tx->restart (RESTART_VALIDATE_COMMIT);
> >      }
> > +  tx->number_executed_txns++;
> >  }
> > Index: libitm/config/x86/target.h
> > ===================================================================
> > --- libitm/config/x86/target.h (revision 219316)
> > +++ libitm/config/x86/target.h (working copy)
> > @@ -126,12 +126,25 @@ htm_abort_should_retry (uint32_t begin_ret)
> >    return begin_ret & _XABORT_RETRY;
> >  }
> >
> > +static inline bool
> > +htm_is_capacity_abort(uint32_t begin_ret)
> > +{
> > +  return begin_ret & _XABORT_CAPACITY;
> > +}
>
> Is this indeed just about capacity, or do we actually mean a more
> general situation?

The idea of the Upper Confidence Bounds (UCB) part of the algorithm was
to distinguish "real" capacity aborts (i.e., permanent ones) from those that
are "transient".
So in this case we really wanted to know whether the abort was a capacity one,
so that we could direct the UCB algorithm and understand the "type" of capacity
abort.
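For reference, the score computed per mode in reoptimize_htm_execution above is
a UCB1-style criterion:

  ucb(mode) = (txns_mode / cycles_mode) / best_ever_throughput
              + sqrt(2 * ln(total_txns) / txns_mode)

where the first term is the mode's throughput normalized by the best throughput
seen so far (the reward), and the second term is the exploration bonus, which
shrinks as a mode accumulates samples.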



>
> > +
> >  /* Returns true iff a hardware transaction is currently being executed.  */
> >  static inline bool
> >  htm_transaction_active ()
> >  {
> >    return _xtest() != 0;
> >  }
> > +
> > +static inline uint64_t rdtsc()
> > +{
> > +    uint32_t hi, lo;
> > +    __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
> > +    return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 );
> > +}
> >  #endif
> >
> >
> > Index: libitm/config/x86/sjlj.S
> > ===================================================================
> > --- libitm/config/x86/sjlj.S (revision 219316)
> > +++ libitm/config/x86/sjlj.S (working copy)
> > @@ -59,12 +59,14 @@
> >  #define pr_hasNoAbort 0x08
> >  #define pr_HTMRetryableAbort 0x800000
> >  #define pr_HTMRetriedAfterAbort 0x1000000
> > +#define pr_HTMCapacityAbort     0x2000000
> >  #define a_runInstrumentedCode 0x01
> >  #define a_runUninstrumentedCode 0x02
> >  #define a_tryHTMFastPath 0x20
> >
> >  #define _XABORT_EXPLICIT (1 << 0)
> >  #define _XABORT_RETRY (1 << 1)
> > +#define _XABORT_CAPACITY (1 << 3)
> >
> >   .text
> >
> > @@ -108,9 +110,12 @@ SYM(_ITM_beginTransaction):
> >  .Ltxn_abort:
> >   /* If it might make sense to retry the HTM fast path, let the C++
> >     code decide.  */
> > - testl $(_XABORT_RETRY|_XABORT_EXPLICIT), %eax
> > + testl $(_XABORT_RETRY|_XABORT_EXPLICIT|_XABORT_CAPACITY), %eax
> >   jz .Lno_htm
> >   orl $pr_HTMRetryableAbort, %edi
> > + testl   $(_XABORT_CAPACITY), %eax
> > + jz      .Lno_htm
> > + orl     $pr_HTMCapacityAbort, %edi
> >   /* Let the C++ code handle the retry policy.  */
> >  .Lno_htm:
> >  #endif
>
>
>


* Re: [PATCH] add self-tuning to x86 hardware fast path in libitm
  2015-05-19  4:54               ` Nuno Diegues
@ 2015-05-19 22:31                 ` Torvald Riegel
  2015-06-14 21:49                   ` Nuno Diegues
  0 siblings, 1 reply; 20+ messages in thread
From: Torvald Riegel @ 2015-05-19 22:31 UTC (permalink / raw)
  To: Nuno Diegues; +Cc: Andi Kleen, gcc-patches, Paolo Romano

On Mon, 2015-05-18 at 23:27 -0400, Nuno Diegues wrote:
> On Mon, May 18, 2015 at 5:29 PM, Torvald Riegel <triegel@redhat.com> wrote:
> >
> > Are there better options for the utility function, or can we tune it to
> > be less affected by varying txn length and likelihood of txnal vs.
> > nontxnal code?  What are the things we want to avoid with the tuning?  I
> > can think of:
> > * Not needing to wait for serial txns, or being aborted by a serial txn.
> > * Not retrying using HTM too much so that the retry time is larger than
> > the scalability we actually gain by having other txns commit
> > concurrently.
> 
> 
> Yes, those are the key points we want to make sure that do not happen.
> 
> >
> >
> > Anything else?  Did you consider other utility functions during your
> > research?
> 
> 
> The txnal vs nontxnal is indeed a completely different story. To account for
> this we would need extra book-keeping to count only cycles spent inside
> txnal code. So this would require every thread (or a sample of threads) to
> perform a rdtsc (or equivalent) on every begin/end call rather than the
> current approach of a single rdtsc per optimization round.
> 
> With this type of online optimization we found that the algorithm had to be
> very simple and cheap to execute. RDTSC was a good finding to fit this, and
> it enabled us to obtain gains. Other time sources failed to do so.
> 
> I do not have, out of the box, a good alternative to offer. I suppose it would
> take some iterations of thinking/experimenting with, just like with any research
> problem :)

So let's iterate on this in parallel with the other changes that we need
to get in place.  I'd prefer to have some more confidence that measuring
txn throughput in epochs is the best way forward.

Here are a few thoughts:

Why do we actually need to measure succeeded transactions?  If a HW txn
is just getting committed without interfering with anything else, is
this really different from, say, two HW txns getting committed?  Aren't
we really just interested in wasted work, or delayed txns?  That may
help taking care of the nontxnal vs. txnal problem.

Measuring committed txns during a time that might otherwise be spent by
a serial txn could be useful to figure out how much other useful work a
serial txn prevents.  But we'd see that as well if we'd just go serial
during the auto-tuning because then concurrent txns will have to wait;
and in this case we could measure it in the slow path of those
concurrent txns (ie, during waiting, where measurement overhead wouldn't
matter as much).

If a serial txn is running, concurrent txns (that wait) are free to sync
and tell the serial one how much cost it creates for the concurrent txns.
There, txn length could matter, but we won't find out for real until
after the concurrent ones have run (they could be pretty short, so we
can't simply assume that the concurrent ones are as long as the serial
one, so that simply the number of concurrent ones can be used to
calculate delayed work).

> 
> > Also, note that the mitigation strategy for rdtsc
> > short-comings that you mention in the paper is not applicable in
> > general, specifically not in libitm.
> 
> 
> I suppose you mean the preemption of threads inflating the cycles measured?

Yes, preemption and migration of threads (in case there's no global
sync'ed TSC or similar) -- you mentioned in the paper that you pin
threads to cores...

> This would be similarly a problem to any time source that tries to measure the
> amount of work performed; not sure how we can avoid it in general. Any thoughts?

Not really as long as we keep depending on measuring time in a
light-weight way.  Measuring smaller time intervals could make it less
likely that preemption happens during such an interval, though.

> 
> > Another issue is that we need to implement the tuning in a portable way.
> > You currently make it depend on whether the assembler knows about RTM,
> > whereas the rest of the code makes this more portable and defers to
> > arch-specific files.  I'd prefer if we could use the tuning on other
> > archs too.  But for that, we need to cleanly separate generic from
> > arch-specific parts.  That affects magic numbers as well as things like
> > rdtsc().
> 
> 
> Yes, I refrained from adding new calls to the arch-specific files, to
> contain the
> changes mainly. But that is possible and that's part of the feedback I
> was hoping
> to get.

OK.  Let me know if you want further input regarding this.

> > I'm wondering about whether it really makes sense to treat XABORT like
> > conflicts and other abort reasons, instead of like capacity aborts.
> > Perhaps we need to differentiate between the explicit abort codes glibc
> > uses, so that we can distinguish between cases where an abort is
> > supposed to signal incompatibility with txnal execution and cases where
> > it's just used for lock elision (see sysdeps/unix/sysv/linux/x86/hle.h
> > in current glibc):
> > #define _ABORT_LOCK_BUSY        0xff
> > #define _ABORT_LOCK_IS_LOCKED   0xfe
> > #define _ABORT_NESTED_TRYLOCK   0xfd
> 
> 
> I am not sure I follow: are you suggesting to consider other aborts besides
> those of capacity?

We may want to do this with UCB, similar to capacity.  Another option
would be to, for now, make fixed choices regarding whether they are
considered permanent or not.
_ABORT_NESTED_TRYLOCK is an incompatibility with txnal execution
(similar to illegal instructions and such).
_ABORT_LOCK_BUSY is failing lock elision due to the lock being already
acquired; thus, this is transient I'd say.
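For illustration, a fixed-choice classification could look roughly like this
(hypothetical helpers; _XABORT_CODE() is the standard RTM macro that extracts
the XABORT imm8, and the _ABORT_* constants are the glibc ones quoted above):

static inline bool
htm_abort_is_incompatible (uint32_t begin_ret)
{
  // Explicit abort whose code signals that txnal execution cannot work at
  // all (similar to illegal instructions); treat it as permanent.
  return (begin_ret & _XABORT_EXPLICIT)
         && _XABORT_CODE (begin_ret) == _ABORT_NESTED_TRYLOCK;
}

static inline bool
htm_abort_is_transient_elision_failure (uint32_t begin_ret)
{
  // Lock elision failed because the lock was already acquired; retrying
  // later may well succeed, so treat it as transient.
  return (begin_ret & _XABORT_EXPLICIT)
         && _XABORT_CODE (begin_ret) == _ABORT_LOCK_BUSY;
}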

Andi, what was _ABORT_LOCK_IS_LOCKED used for again?

> > Finally, please pay attention to using the same leading tabs/spaces
> > style as current libitm code; it could be just your MUA, but it seems
> > that your patch uses just leading spaces.  libitm uses 8-char tabs for
> > leading space.
> 
> 
> Now that I know the style, I will enforce it ;)
> Previously I had trusted my editor to follow the current style, but maybe
> it was not consistent everywhere, or it just got confused.

Thanks :)

> >
> >
> > > +    optimizer.optimized_mode = STUBBORN;
> > > +    optimizer.optimized_attempts = htm_fastpath;
> > > +    optimizer.last_cycles = rdtsc();
> > > +    optimizer.last_total_txs_executed = 0;
> > > +    optimizer.last_throughput = fixed_const(0.0001);
> > > +    optimizer.last_attempts = htm_fastpath > 0 ? htm_fastpath - 1 : 1;
> > > +    optimizer.best_ever_throughput = optimizer.last_throughput;
> > > +    optimizer.best_ever_attempts = htm_fastpath;
> > > +    optimizer.txns_while_stubborn = 1;
> > > +    optimizer.cycles_while_stubborn = 100;
> >
> > If the assumption is that transactions can't be faster than 100 cycles,
> > please document it briefly.
> 
> 
> No, it does not imply that. This is an "accumulator" of cycles spent in
> the mode; it need only to be positive as far as I remember.

OK.  Please find out what the requirement is and document it (magic
numbers...).

> > > --- libitm/beginend.cc (revision 219316)
> > > +++ libitm/beginend.cc (working copy)
> > > @@ -25,7 +25,6 @@
> > >  #include "libitm_i.h"
> > >  #include <pthread.h>
> > >
> > > -
> >
> > Minor: just drop this chunk.
> 
> 
> What is this referring to?

This is the part of the patch where all you do is delete a line.

> > > @@ -190,7 +411,7 @@ GTM::gtm_thread::begin_transaction (uint32_t prop,
> > >  #ifndef HTM_CUSTOM_FASTPATH
> > >    if (likely(htm_fastpath && (prop & pr_hasNoAbort)))
> > >      {
> > > -      for (uint32_t t = htm_fastpath; t; t--)
> > > +      for (uint32_t t = optimizer.optimized_attempts; t; t--)
> >
> > Why this change?  Couldn't you keep htm_fastpath as the globally set
> > number of attempts?
> 
> The idea was to keep the variables tuned within the optimizer struct. Hence,
> the optimized_attempts replaced the role of the htm_fastpath whenever the
> tuning is enabled. We can keep it in sync with the optimizer, though, by making
> it additionally assigned during the re-optimization with the final value of the
> optimized_attempts.

I'd prefer the latter.  If we end up having other configurations that
use HTM but don't use the optimizer, it would be awkward to have the
same logical setting in two locations depending on the configuration
(ie, htm_fastpath or optimized_attempts).

Also, even though we don't do this yet, it may be easier to co-locate
the separate htm_fastpath value with something else (e.g., the serial
lock), so that we can save a cacheline of capacity in case of nested
txns.  htm_fastpath is one of the few things that a HW txn must access.

> > > @@ -665,12 +917,21 @@ _ITM_commitTransaction(void)
> > >    if (likely(htm_fastpath && !gtm_thread::serial_lock.is_write_locked()))
> > >      {
> > >        htm_commit();
> > > +      gtm_thread *tx = gtm_thr();
> > > +      if (unlikely(tx == NULL))
> > > +        {
> > > +          tx = new gtm_thread();
> > > +          set_gtm_thr(tx);
> > > +        }
> >
> > Are there actually situations in which we need to create a gtm_thread?
> 
> I did not check in run-time; statically, I followed the approach to
> check, which
> is quite omnipresent in the code.

Yeah, it's necessary.  Can you add a comment along the lines of:
When using the HTM fastpath, we might not have created a thread object
yet.

> >
> > > +      tx->number_executed_txns++;
> >
> > It might be nice if we can avoid this, so that we don't touch the
> > additional cacheline when we have nested txns.
> 
> This is really important to have some way to measure success/progress
> to guide the tuning.

I'm aware of that -- but is there a way to count differently (see the
utility function point above), or just count failures so that the actual
counting happens on a slow path (eg, after abort)?

> > > --- libitm/config/x86/target.h (revision 219316)
> > > +++ libitm/config/x86/target.h (working copy)
> > > @@ -126,12 +126,25 @@ htm_abort_should_retry (uint32_t begin_ret)
> > >    return begin_ret & _XABORT_RETRY;
> > >  }
> > >
> > > +static inline bool
> > > +htm_is_capacity_abort(uint32_t begin_ret)
> > > +{
> > > +  return begin_ret & _XABORT_CAPACITY;
> > > +}
> >
> > Is this indeed just about capacity, or do we actually mean a more
> > general situation?
> 
> The idea of the Upper Confidence Bounds (UCB) part of the algorithm was
> to detect "real" capacity aborts (i.e., permanent) vs those that are
> "transient".
> So in this case we really wanted to know if the abort was a capacity one so
> that we could direct the UCB algorithm and understand the "type" of capacity
> abort.

OK.  But do you want to single out capacity aborts as one case for which
you want to detect permanent vs. transient, or do you want to generally
find out whether aborts that are reported as permanent (e.g., capacity)
are indeed permanent?  In the latter case, a more generic name would be
more appropriate.  (And that's not just a naming thing, but guidance for
other archs regarding what they should put in the "capacity" bucket of
abort reasons...)



* Re: [PATCH] add self-tuning to x86 hardware fast path in libitm
  2015-05-19 22:31                 ` Torvald Riegel
@ 2015-06-14 21:49                   ` Nuno Diegues
  2015-08-24 12:18                     ` Nuno Diegues
  0 siblings, 1 reply; 20+ messages in thread
From: Nuno Diegues @ 2015-06-14 21:49 UTC (permalink / raw)
  To: Torvald Riegel; +Cc: Andi Kleen, gcc-patches, Paolo Romano

Hello everyone,

just wanted to ping back to say that I have been overwhelmed with work
and will be back on this as soon as possible, most likely during July.


Best regards,
-- Nuno Diegues

On Tue, May 19, 2015 at 3:17 PM, Torvald Riegel <triegel@redhat.com> wrote:
> On Mon, 2015-05-18 at 23:27 -0400, Nuno Diegues wrote:
>> On Mon, May 18, 2015 at 5:29 PM, Torvald Riegel <triegel@redhat.com> wrote:
>> >
>> > Are there better options for the utility function, or can we tune it to
>> > be less affected by varying txn length and likelihood of txnal vs.
>> > nontxnal code?  What are the things we want to avoid with the tuning?  I
>> > can think of:
>> > * Not needing to wait for serial txns, or being aborted by a serial txn.
>> > * Not retrying using HTM too much so that the retry time is larger than
>> > the scalability we actually gain by having other txns commit
>> > concurrently.
>>
>>
>> Yes, those are the key points we want to make sure that do not happen.
>>
>> >
>> >
>> > Anything else?  Did you consider other utility functions during your
>> > research?
>>
>>
>> The txnal vs nontxnal is indeed a completely different story. To account for
>> this we would need extra book-keeping to count only cycles spent inside
>> txnal code. So this would require every thread (or a sample of threads) to
>> perform a rdtsc (or equivalent) on every begin/end call rather than the
>> current approach of a single rdtsc per optimization round.
>>
>> With this type of online optimization we found that the algorithm had to be
>> very simple and cheap to execute. RDTSC was a good finding to fit this, and
>> it enabled us to obtain gains. Other time sources failed to do so.
>>
>> I do not have, out of the box, a good alternative to offer. I suppose it would
>> take some iterations of thinking/experimenting with, just like with any research
>> problem :)
>
> So let's iterate on this in parallel with the other changes that we need
> to get in place.  I'd prefer to have some more confidence that measuring
> txn throughput in epochs is the best way forward.
>
> Here are a few thoughts:
>
> Why do we actually need to measure succeeded transactions?  If a HW txn
> is just getting committed without interfering with anything else, is
> this really different from, say, two HW txns getting committed?  Aren't
> we really just interested in wasted work, or delayed txns?  That may
> help taking care of the nontxnal vs. txnal problem.
>
> Measuring committed txns during a time that might otherwise be spent by
> a serial txns could be useful to figure out how much other useful work a
> serial txn prevents.  But we'd see that as well if we'd just go serial
> during the auto-tuning because then concurrent txns will have to wait;
> and in this case we could measure it in the slow path of those
> concurrent txns (ie, during waiting, where measurement overhead wouldn't
> matter as much).
>
> If a serial txn is running, concurrent txns (that wait) are free to sync
> and tell the serial how much cost it creates for the concurrent txns.
> There, txn length could matter, but we won't find out for real until
> after the concurrent ones have run (they could be pretty short, so we
> can't simply assume that the concurrent ones are as long as the serial
> one, so that simply the number of concurrent ones can be used to
> calculate delayed work).
>
>>
>> > Also, note that the mitigation strategy for rdtsc
>> > short-comings that you mention in the paper is not applicable in
>> > general, specifically not in libitm.
>>
>>
>> I suppose you mean the preemption of threads inflating the cycles measured?
>
> Yes, preemption and migration of threads (in case there's no global
> sync'ed TSC or similar) -- you mentioned in the paper that you pin
> threads to cores...
>
>> This would be similarly a problem to any time source that tries to measure the
>> amount of work performed; not sure how we can avoid it in general. Any thoughts?
>
> Not really as long as we keep depending on measuring time in a
> light-weight way.  Measuring smaller time intervals could make it less
> likely that preemption happens during such an interval, though.
>
>>
>> > Another issue is that we need to implement the tuning in a portable way.
>> > You currently make it depend on whether the assembler knows about RTM,
>> > whereas the rest of the code makes this more portable and defers to
>> > arch-specific files.  I'd prefer if we could use the tuning on other
>> > archs too.  But for that, we need to cleanly separate generic from
>> > arch-specific parts.  That affects magic numbers as well as things like
>> > rdtsc().
>>
>>
>> Yes, I refrained from adding new calls to the arch-specific files, to
>> contain the
>> changes mainly. But that is possible and that's part of the feedback I
>> was hoping
>> to get.
>
> OK.  Let me know if you want further input regarding this.
>
>> > I'm wondering about whether it really makes sense to treat XABORT like
>> > conflicts and other abort reasons, instead of like capacity aborts.
>> > Perhaps we need to differentiate between the explicit abort codes glibc
>> > uses, so that we can distinguish between cases where an abort is
>> > supposed to signal incompatibility with txnal execution and cases where
>> > it's just used for lock elision (see sysdeps/unix/sysv/linux/x86/hle.h
>> > in current glibc):
>> > #define _ABORT_LOCK_BUSY        0xff
>> > #define _ABORT_LOCK_IS_LOCKED   0xfe
>> > #define _ABORT_NESTED_TRYLOCK   0xfd
>>
>>
>> I am not sure I follow: are you suggesting to consider other aborts besides
>> those of capacity?
>
> We may want to do this with UCB, similar to capacity.  Another option
> would be to, for now, make fixed choices regarding whether they are
> considered permanent or not.
> _ABORT_NESTED_TRYLOCK is an incompatibility with txnal execution
> (similar to illegal instructions and such).
> _ABORT_LOCK_BUSY is failing lock elision due to the lock being already
> acquired; thus, this is transient I'd say.
>
> Andi, what was _ABORT_LOCK_IS_LOCKED used for again?
>
>> > Finally, please pay attention to using the same leading tabs/spaces
>> > style as current libitm code; it could be just your MUA, but it seems
>> > that your patch uses just leading spaces.  libitm uses 8-char tabs for
>> > leading space.
>>
>>
>> Now that I know the style, I will enforce it ;)
>> Previously I had trusted my editor to follow the current style, but maybe
>> it was not consistent everywhere, or it just got confused.
>
> Thanks :)
>
>> >
>> >
>> > > +    optimizer.optimized_mode = STUBBORN;
>> > > +    optimizer.optimized_attempts = htm_fastpath;
>> > > +    optimizer.last_cycles = rdtsc();
>> > > +    optimizer.last_total_txs_executed = 0;
>> > > +    optimizer.last_throughput = fixed_const(0.0001);
>> > > +    optimizer.last_attempts = htm_fastpath > 0 ? htm_fastpath - 1 : 1;
>> > > +    optimizer.best_ever_throughput = optimizer.last_throughput;
>> > > +    optimizer.best_ever_attempts = htm_fastpath;
>> > > +    optimizer.txns_while_stubborn = 1;
>> > > +    optimizer.cycles_while_stubborn = 100;
>> >
>> > If the assumption is that transactions can't be faster than 100 cycles,
>> > please document it briefly.
>>
>>
>> No, it does not imply that. This is an "accumulator" of cycles spent in
>> the mode; it need only to be positive as far as I remember.
>
> OK.  Please find out what the requirement is and document it (magic
> numbers...).
>
>> > > --- libitm/beginend.cc (revision 219316)
>> > > +++ libitm/beginend.cc (working copy)
>> > > @@ -25,7 +25,6 @@
>> > >  #include "libitm_i.h"
>> > >  #include <pthread.h>
>> > >
>> > > -
>> >
>> > Minor: just drop this chunk.
>>
>>
>> What is this referring to?
>
> This part of the patch where all you do is delete a line.
>
>> > > @@ -190,7 +411,7 @@ GTM::gtm_thread::begin_transaction (uint32_t prop,
>> > >  #ifndef HTM_CUSTOM_FASTPATH
>> > >    if (likely(htm_fastpath && (prop & pr_hasNoAbort)))
>> > >      {
>> > > -      for (uint32_t t = htm_fastpath; t; t--)
>> > > +      for (uint32_t t = optimizer.optimized_attempts; t; t--)
>> >
>> > Why this change?  Couldn't you keep htm_fastpath as the globally set
>> > number of attempts?
>>
>> The idea was to keep the variables tuned within the optimizer struct. Hence,
>> the optimized_attempts replaced the role of the htm_fastpath whenever the
>> tuning is enabled. We can keep it in sync with the optimizer, though, by making
>> it additionally assigned during the re-optimization with the final value of the
>> optimized_attempts.
>
> I'd prefer the latter.  If we end up having other configurations that
> use HTM but don't use the optimizer, it would be awkward to have the
> same logical setting in two locations depending on the configuration
> (ie, htm_fastpath or optimized_attempts).
>
> Also, even though we don't do this yet, it may be easier to co-locate
> the separate htm_fastpath value with something else (e.g., the serial
> lock), so that we can save a cacheline of capacity in case of nested
> txns.  htm_fastpath is one of the few things that a HW txn must access.
>
>> > > @@ -665,12 +917,21 @@ _ITM_commitTransaction(void)
>> > >    if (likely(htm_fastpath && !gtm_thread::serial_lock.is_write_locked()))
>> > >      {
>> > >        htm_commit();
>> > > +      gtm_thread *tx = gtm_thr();
>> > > +      if (unlikely(tx == NULL))
>> > > +        {
>> > > +          tx = new gtm_thread();
>> > > +          set_gtm_thr(tx);
>> > > +        }
>> >
>> > Are there actually situations in which we need to create a gtm_thread?
>>
>> I did not check in run-time; statically, I followed the approach to
>> check, which
>> is quite omnipresent in the code.
>
> Yeah, it's necessary.  Can you add a comment along the lines of:
> When using the HTM fastpath, we might not have created a thread object
> yet.
>
>> >
>> > > +      tx->number_executed_txns++;
>> >
>> > It might be nice if we can avoid this, so that we don't touch the
>> > additional cacheline when we have nested txns.
>>
>> This is really important to have some way to measure success/progress
>> to guide the tuning.
>
> I'm aware of that -- but is there a way to count differently (see the
> utility function point above), or just count failures so that the actual
> counting happens on a slow path (eg, after abort)?
>
>> > > --- libitm/config/x86/target.h (revision 219316)
>> > > +++ libitm/config/x86/target.h (working copy)
>> > > @@ -126,12 +126,25 @@ htm_abort_should_retry (uint32_t begin_ret)
>> > >    return begin_ret & _XABORT_RETRY;
>> > >  }
>> > >
>> > > +static inline bool
>> > > +htm_is_capacity_abort(uint32_t begin_ret)
>> > > +{
>> > > +  return begin_ret & _XABORT_CAPACITY;
>> > > +}
>> >
>> > Is this indeed just about capacity, or do we actually mean a more
>> > general situation?
>>
>> The idea of the Upper Confidence Bounds (UCB) part of the algorithm was
>> to detect "real" capacity aborts (i.e., permanent) vs those that are
>> "transient".
>> So in this case we really wanted to know if the abort was a capacity one so
>> that we could direct the UCB algorithm and understand the "type" of capacity
>> abort.
>
> OK.  But do you want to single out capacity aborts as one case for which
> you want to detect permanent vs. transient, or do you want to generally
> find out whether aborts that are reported as permanent (e.g., capacity)
> are indeed permanent?  In the latter case, a more generic name would be
> more appropriate.  (And that's not just a naming thing, but guidance for
> other archs regarding what they should put in the "capacity" bucket of
> abort reasons...)
>
>


* Re: [PATCH] add self-tuning to x86 hardware fast path in libitm
  2015-06-14 21:49                   ` Nuno Diegues
@ 2015-08-24 12:18                     ` Nuno Diegues
  2015-11-02  8:58                       ` Nuno Diegues
  0 siblings, 1 reply; 20+ messages in thread
From: Nuno Diegues @ 2015-08-24 12:18 UTC (permalink / raw)
  To: Torvald Riegel; +Cc: Andi Kleen, gcc-patches, Paolo Romano

Hello everyone,

after a summer internship and some backlog catching-up in the past
weeks, I have finally got around to revising the patch according to the
latest discussion.

The changes have not caused any regression, and the latest speedup results
are consistent with what we had before.


In the following I add some comments inline to the discussion, and
after that paste the updated patch.


> >
> > So let's iterate on this in parallel with the other changes that we need
> > to get in place.  I'd prefer to have some more confidence that measuring
> > txn throughput in epochs is the best way forward.
> >
> > Here are a few thoughts:
> >
> > Why do we actually need to measure succeeded transactions?  If a HW txn
> > is just getting committed without interfering with anything else, is
> > this really different from, say, two HW txns getting committed?  Aren't
> > we really just interested in wasted work, or delayed txns?  That may
> > help taking care of the nontxnal vs. txnal problem.
> >
> > Measuring committed txns during a time that might otherwise be spent by
> > a serial txns could be useful to figure out how much other useful work a
> > serial txn prevents.  But we'd see that as well if we'd just go serial
> > during the auto-tuning because then concurrent txns will have to wait;
> > and in this case we could measure it in the slow path of those
> > concurrent txns (ie, during waiting, where measurement overhead wouldn't
> > matter as much).
> >
> > If a serial txn is running, concurrent txns (that wait) are free to sync
> > and tell the serial how much cost it creates for the concurrent txns.
> > There, txn length could matter, but we won't find out for real until
> > after the concurrent ones have run (they could be pretty short, so we
> > can't simply assume that the concurrent ones are as long as the serial
> > one, so that simply the number of concurrent ones can be used to
> > calculate delayed work).


I understand your concern: measuring failure in a slow path rather than
success in the fast/common path.

My problem is that the approach taken in our work is tailored to assessing one
given metric, in our case some form of throughput of successfully executed
atomic blocks.
However, to measure failure, we seem to need two things:
 1) the rate of failed hardware transactions
 2) the time spent waiting for the serial lock

In short, performance will be bad if there is a bad enough mix of those two
issues. The problem is how to define a metric that considers the two together;
it is not obvious to me that there is one. Furthermore, that deviates
significantly from our approach, and in the end, even if there is a solution
that works in that way, it is a different approach, not one that can be applied
to our self-tuning framework.

What we are doing right now is incrementing a counter in a (most likely cached)
line that is single-writer and multiple-reader; most of the time it will be
owned by the single writer. As such, the cost should be quite negligible
compared to the execution of the atomic block, all the more so when a hardware
transaction was used, as the commit itself should take a hundred cycles or more.

A good way to measure this impact is to use the new "htm_notune" flag and
compare its performance with the unchanged libitm that I use as a baseline
above. The average results are similar, without any noticeable trend outside
the standard deviation in either direction.




>
> >
> >>
> >> > Also, note that the mitigation strategy for rdtsc
> >> > short-comings that you mention in the paper is not applicable in
> >> > general, specifically not in libitm.
> >>
> >>
> >> I suppose you mean the preemption of threads inflating the cycles measured?
> >
> > Yes, preemption and migration of threads (in case there's no global
> > sync'ed TSC or similar) -- you mentioned in the paper that you pin
> > threads to cores...
> >
> >> This would be similarly a problem to any time source that tries to measure the
> >> amount of work performed; not sure how we can avoid it in general. Any thoughts?
> >
> > Not really as long as we keep depending on measuring time in a
> > light-weight way.  Measuring smaller time intervals could make it less
> > likely that preemption happens during such an interval, though.
> >


Note that in this patch we use no such pinning. In fact, I have measured that
it does not have a noticeable impact on performance. Maybe that could change
with heavy over-subscription of the cores by lots of threads.
Still, it is not a correctness issue, so I believe the fact that this performs
great in the common case should make it prevail.



>
> >>
> >> > Another issue is that we need to implement the tuning in a portable way.
> >> > You currently make it depend on whether the assembler knows about RTM,
> >> > whereas the rest of the code makes this more portable and defers to
> >> > arch-specific files.  I'd prefer if we could use the tuning on other
> >> > archs too.  But for that, we need to cleanly separate generic from
> >> > arch-specific parts.  That affects magic numbers as well as things like
> >> > rdtsc().
> >>
> >>
> >> Yes, I refrained from adding new calls to the arch-specific files, to
> >> contain the
> >> changes mainly. But that is possible and that's part of the feedback I
> >> was hoping
> >> to get.
> >
> > OK.  Let me know if you want further input regarding this.


I have introduced this arch-specific separation in the new patch. PTAL



>
> >
> >> > I'm wondering about whether it really makes sense to treat XABORT like
> >> > conflicts and other abort reasons, instead of like capacity aborts.
> >> > Perhaps we need to differentiate between the explicit abort codes glibc
> >> > uses, so that we can distinguish between cases where an abort is
> >> > supposed to signal incompatibility with txnal execution and cases where
> >> > it's just used for lock elision (see sysdeps/unix/sysv/linux/x86/hle.h
> >> > in current glibc):
> >> > #define _ABORT_LOCK_BUSY        0xff
> >> > #define _ABORT_LOCK_IS_LOCKED   0xfe
> >> > #define _ABORT_NESTED_TRYLOCK   0xfd
> >>
> >>
> >> I am not sure I follow: are you suggesting to consider other aborts besides
> >> those of capacity?
> >
> > We may want to do this with UCB, similar to capacity.  Another option
> > would be to, for now, make fixed choices regarding whether they are
> > considered permanent or not.
> > _ABORT_NESTED_TRYLOCK is an incompatibility with txnal execution
> > (similar to illegal instructions and such).
> > _ABORT_LOCK_BUSY is failing lock elision due to the lock being already
> > acquired; thus, this is transient I'd say.
> >
> > Andi, what was _ABORT_LOCK_IS_LOCKED used for again?


This would seem a more or less trivial change once it is agreed upon, so I
suggest we leave it for later, if/when this patch gets through.


 >
>
> >> > > --- libitm/config/x86/target.h (revision 219316)
> >> > > +++ libitm/config/x86/target.h (working copy)
> >> > > @@ -126,12 +126,25 @@ htm_abort_should_retry (uint32_t begin_ret)
> >> > >    return begin_ret & _XABORT_RETRY;
> >> > >  }
> >> > >
> >> > > +static inline bool
> >> > > +htm_is_capacity_abort(uint32_t begin_ret)
> >> > > +{
> >> > > +  return begin_ret & _XABORT_CAPACITY;
> >> > > +}
> >> >
> >> > Is this indeed just about capacity, or do we actually mean a more
> >> > general situation?
> >>
> >> The idea of the Upper Confidence Bounds (UCB) part of the algorithm was
> >> to detect "real" capacity aborts (i.e., permanent) vs those that are
> >> "transient".
> >> So in this case we really wanted to know if the abort was a capacity one so
> >> that we could direct the UCB algorithm and understand the "type" of capacity
> >> abort.
> >
> > OK.  But do you want to single out capacity aborts as one case for which
> > you want to detect permanent vs. transient, or do you want to generally
> > find out whether aborts that are reported as permanent (e.g., capacity)
> > are indeed permanent?  In the latter case, a more generic name would be
> > more appropriate.  (And that's not just a naming thing, but guidance for
> > other archs regarding what they should put in the "capacity" bucket of
> > abort reasons...)
> >
> >


As far as we know, capacity aborts are the ones that fit our criteria:
they are identifiable as a distinct abort category, yet they are not
necessarily persistent/deterministic. This could be expanded to other
abort types in the future, but it is not something we have tested, as the
other abort types are not as widespread.





Updated patch follows:

Index: util.cc
===================================================================
--- util.cc (revision 219316)
+++ util.cc (working copy)
@@ -94,4 +94,15 @@ xrealloc (void *old, size_t size, bool separate_cl
   return r;
 }

+// State for the generation is not synchronized. It assumes single thread
+// usage at any time. It is, in fact, invoked by the thread that is
+// re-optimizing the libitm HTM usage, so the assumption holds.
+uint32_t
+random_int(uint32_t max)
+{
+  static uint32_t seed = 123456789;
+  seed = (1103515245 * seed + 12345);
+  return seed % max;
+}
+
 } // namespace GTM
Index: libitm.h
===================================================================
--- libitm.h (revision 219316)
+++ libitm.h (working copy)
@@ -101,7 +101,11 @@ typedef enum
    /* These are not part of the ABI but used for custom HTM fast paths.  See
       ITM_beginTransaction and gtm_thread::begin_transaction.  */
    pr_HTMRetryableAbort = 0x800000,
-   pr_HTMRetriedAfterAbort = 0x1000000
+   pr_HTMRetriedAfterAbort = 0x1000000,
+   /* Reports whether a capacity abort was detected in the custom HTM fast
+      path, so that it can be used in the abort handler in
+      gtm_thread::begin_transaction.  */
+   pr_HTMCapacityAbort          = 0x2000000
 } _ITM_codeProperties;

 /* Result from startTransaction that describes what actions to take.
Index: method-serial.cc
===================================================================
--- method-serial.cc (revision 219316)
+++ method-serial.cc (working copy)
@@ -223,6 +223,19 @@ struct htm_mg : public method_group
     // initially disabled.
 #ifdef USE_HTM_FASTPATH
     htm_fastpath = htm_init();
+    optimizer.optimized_mode = STUBBORN;
+    optimizer.last_cycles = get_efficient_time();
+    optimizer.last_total_txs_executed = 0;
+    optimizer.last_throughput = optimizer.MIN_THROUGHPUT;
+    optimizer.last_attempts = htm_fastpath > 0 ? htm_fastpath - 1 : 1;
+    optimizer.best_ever_throughput = optimizer.last_throughput;
+    optimizer.best_ever_attempts = htm_fastpath;
+    optimizer.txns_while_stubborn = 1;
+    optimizer.cycles_while_stubborn = optimizer.INIT_CYCLES;
+    optimizer.txns_while_halven = 1;
+    optimizer.cycles_while_halven = optimizer.INIT_CYCLES;
+    optimizer.txns_while_giveup = 1;
+    optimizer.cycles_while_giveup = optimizer.INIT_CYCLES;
 #endif
   }
   virtual void fini()
Index: retry.cc
===================================================================
--- retry.cc (revision 219316)
+++ retry.cc (working copy)
@@ -218,7 +218,6 @@ GTM::gtm_thread::set_default_dispatch(GTM::abi_dis
   default_dispatch.store(disp, memory_order_relaxed);
 }

-
 static GTM::abi_dispatch*
 parse_default_method()
 {
@@ -254,10 +253,17 @@ parse_default_method()
       disp = GTM::dispatch_ml_wt();
       env += 5;
     }
+  else if (strncmp(env, "htm_notune", 10) == 0)
+    {
+      disp = GTM::dispatch_htm();
+      env += 10;
+      GTM::optimizer.set_is_enabled(0);
+    }
   else if (strncmp(env, "htm", 3) == 0)
     {
       disp = GTM::dispatch_htm();
       env += 3;
+      GTM::optimizer.set_is_enabled(1);
     }
   else
     goto unknown;
Index: beginend.cc
===================================================================
--- beginend.cc (revision 219316)
+++ beginend.cc (working copy)
@@ -39,6 +39,9 @@ unsigned GTM::gtm_thread::number_of_threads = 0;
 gtm_stmlock GTM::gtm_stmlock_array[LOCK_ARRAY_SIZE];
 atomic<gtm_version> GTM::gtm_clock;

+// Holds the optimized configuration for parameters relevant to HTM execution.
+gtm_global_optimizer GTM::optimizer;
+
 /* ??? Move elsewhere when we figure out library initialization.  */
 uint64_t GTM::gtm_spin_count_var = 1000;

@@ -95,7 +98,139 @@ thread_exit_init()
     GTM_fatal("Creating thread release TLS key failed.");
 }

+/* Implementations for the fixed-point arithmetic type. */
+const uint32_t GTM::fixed_pt_t::FIXED_PT_WIDTH = 64;
+const uint32_t GTM::fixed_pt_t::FIXED_PT_INTEGER_WIDTH = 40;
+const uint32_t GTM::fixed_pt_t::FIXED_PT_FRAC_WIDTH =
+  GTM::fixed_pt_t::FIXED_PT_WIDTH - GTM::fixed_pt_t::FIXED_PT_INTEGER_WIDTH;
+const fixed_pt_t GTM::fixed_pt_t::FIXED_PT_ONE = GTM::fixed_pt_t::one();
+const fixed_pt_t GTM::fixed_pt_t::FIXED_PT_TWO =
+  GTM::fixed_pt_t::FIXED_PT_ONE.add(GTM::fixed_pt_t::FIXED_PT_ONE);

+fixed_pt_t
+GTM::fixed_pt_t::one()
+{
+  fixed_pt_t result(0);
+  result.number = static_cast<uint64_t>(1) << FIXED_PT_FRAC_WIDTH;
+  return result;
+}
+
+fixed_pt_t
+GTM::fixed_pt_t::add(const fixed_pt_t operand) const
+{
+  fixed_pt_t result(0);
+  result.number = this->number + operand.number;
+  return result;
+}
+
+fixed_pt_t
+GTM::fixed_pt_t::sub(const fixed_pt_t operand) const
+{
+  fixed_pt_t result(0);
+  result.number = this->number - operand.number;
+  return result;
+}
+
+fixed_pt_t
+GTM::fixed_pt_t::mul(const fixed_pt_t operand) const
+{
+  fixed_pt_t result(0);
+  result.number = (static_cast<__uint128_t>(this->number) *
+     static_cast<__uint128_t>(operand.number)) >> FIXED_PT_FRAC_WIDTH;
+  return result;
+}
+
+fixed_pt_t
+GTM::fixed_pt_t::div(const fixed_pt_t operand) const
+{
+  fixed_pt_t result(0);
+  result.number =
+    (static_cast<__uint128_t>(this->number) << FIXED_PT_FRAC_WIDTH) /
+      static_cast<__uint128_t>(operand.number);
+  return result;
+}
+
+fixed_pt_t
+GTM::fixed_pt_t::sqrt() const
+{
+  uint64_t invert = 0;
+  uint64_t iter = FIXED_PT_FRAC_WIDTH;
+  uint64_t i;
+  fixed_pt_t n = *this;
+
+  if (n.number == 0 || n.number == FIXED_PT_ONE.number)
+    {
+      return n;
+    }
+  if (n.number < FIXED_PT_ONE.number && n.number > 6)
+    {
+      invert = 1;
+      n = FIXED_PT_ONE.div(n);
+    }
+  if (n.number > FIXED_PT_ONE.number)
+    {
+      uint64_t s = n.number;
+      iter = 0;
+      while (s > 0)
+        {
+          s >>= 2;
+          iter++;
+        }
+    }
+
+  fixed_pt_t l(0);
+  l.number = (n.number >> 1) + 1;
+  for (i = 0; i < iter; i++)
+    {
+      l.number = l.add(n.div(l)).number >> 1;
+    }
+  if (invert)
+    {
+      return FIXED_PT_ONE.div(l);
+    }
+  return l;
+}
+
+fixed_pt_t
+GTM::fixed_pt_t::ln() const
+{
+  const fixed_pt_t LN2 = fixed_pt_t(0.69314718055994530942);
+  const fixed_pt_t LG[7] = {
+    fixed_pt_t(6.666666666666735130e-01),
+    fixed_pt_t(3.999999999940941908e-01),
+    fixed_pt_t(2.857142874366239149e-01),
+    fixed_pt_t(2.222219843214978396e-01),
+    fixed_pt_t(1.818357216161805012e-01),
+    fixed_pt_t(1.531383769920937332e-01),
+    fixed_pt_t(1.479819860511658591e-01)
+  };
+
+  if (this->number == 0)
+    {
+      return 0xffffffff;
+    }
+
+  fixed_pt_t log2 = fixed_pt_t(0);
+  fixed_pt_t xi = *this;
+  while (xi.number > FIXED_PT_TWO.number)
+    {
+      xi.number >>= 1;
+      log2.number++;
+    }
+
+  fixed_pt_t f = xi.sub(FIXED_PT_ONE);
+  fixed_pt_t s = f.div(FIXED_PT_TWO.add(f));
+  fixed_pt_t z = s.mul(s);
+  fixed_pt_t w = z.mul(z);
+  fixed_pt_t R =
+    w.mul(LG[1].add(w.mul(LG[3].add(w.mul(LG[5]))))).add(
+      z.mul(LG[0].add(w.mul(LG[2].add(
+        w.mul(LG[4].add(w.mul(LG[6]))))))));
+
+  return LN2.mul(fixed_pt_t(log2.number << FIXED_PT_FRAC_WIDTH)).add(
+    f.sub(s.mul(f.sub(R))));
+}
+
 GTM::gtm_thread::~gtm_thread()
 {
   if (nesting > 0)
@@ -149,6 +284,180 @@ choose_code_path(uint32_t prop, abi_dispatch *disp
     return a_runInstrumentedCode;
 }

+// Periodically invoked by some thread(s) that are in the HTM fall-back path.
+// This implements the Upper Confidence Bounds and Gradient Descent to tune,
+// respectively, how to deal with HTM capacity aborts and how many HTM retries
+// to allow. For more details, check Diegues et al. in USENIX ICAC 2014.
+void
+GTM::gtm_global_optimizer::reoptimize_htm_execution()
+{
+  // Avoid redundant re-optimization if another thread may be doing it already.
+  // This allows the threads to avoid racing to re-optimize concurrently, which
+  // is useless.
+
+  if (unlikely(GTM::gtm_thread::serial_lock.is_write_locked()))
+    {
+      return;
+    }
+  // Eventually, some thread(s) will follow this path, even in the face of
+  // spurious short-cut exits due to the serial lock being taken.
+  GTM::gtm_thread::serial_lock.write_lock();
+
+  // Collect the statistics obtained so far.
+  uint64_t total_txs_executed = 0;
+  gtm_thread *ptr = GTM::gtm_thread::list_of_threads;
+  for (; ptr; ptr = ptr->next_thread)
+    {
+      total_txs_executed += ptr->number_executed_txns;
+    }
+
+  // Obtain the delta performance with respect to the last period measured.
+  // We expect the underlying architecture to provide an efficient way to
+  // measure time passed since the last re-optimization.
+  const uint64_t current_cycles = get_efficient_time();
+  const uint64_t cycles_used = current_cycles - optimizer.last_cycles;
+  const uint64_t new_txs_executed =
+    total_txs_executed - optimizer.last_total_txs_executed;
+  const fixed_pt_t current_throughput =
+    fixed_pt_t(new_txs_executed).div(fixed_pt_t(cycles_used));
+
+  if (current_throughput.number == 0)
+    {
+      // This thread probably executed right after another one as they both
+      // found it was time to re-optimize, but did not contend on the serial
+      // lock. As such, we avoid redundant re-optimizations and short cut out.
+      GTM::gtm_thread::serial_lock.write_unlock();
+      return;
+    }
+
+  optimizer.last_cycles = current_cycles;
+  optimizer.last_total_txs_executed = total_txs_executed;
+  if (optimizer.optimized_mode == STUBBORN)
+    {
+      optimizer.txns_while_stubborn += new_txs_executed;
+      optimizer.cycles_while_stubborn += cycles_used;
+    }
+  else if (optimizer.optimized_mode == HALVEN)
+    {
+      optimizer.txns_while_halven += new_txs_executed;
+      optimizer.cycles_while_halven += cycles_used;
+    }
+  else
+    {
+      optimizer.txns_while_giveup += new_txs_executed;
+      optimizer.cycles_while_giveup += cycles_used;
+    }
+
+  // Compute Upper Confidence Bounds for the mode to choose next to decide on
+  // how to deal with HTM capacity aborts.
+  // Use fixed point arithmetic types to spare floating point usage.
+  const fixed_pt_t log_sum =
+    GTM::fixed_pt_t::FIXED_PT_TWO.mul(
+      (fixed_pt_t(optimizer.txns_while_stubborn).add(
+       fixed_pt_t(optimizer.txns_while_halven)).add(
+       fixed_pt_t(optimizer.txns_while_giveup))).ln());
+  const fixed_pt_t ucb_stubborn =
+    ((fixed_pt_t(optimizer.txns_while_stubborn).div(
+      fixed_pt_t(optimizer.cycles_while_stubborn))).div(
+      optimizer.best_ever_throughput)).add(
+     (log_sum.div(fixed_pt_t(optimizer.txns_while_stubborn))).sqrt());
+  const fixed_pt_t ucb_halven =
+    ((fixed_pt_t(optimizer.txns_while_halven).div(
+      fixed_pt_t(optimizer.cycles_while_halven))).div(
+      optimizer.best_ever_throughput)).add(
+     (log_sum.div(fixed_pt_t(optimizer.txns_while_halven))).sqrt());
+  const fixed_pt_t ucb_giveup =
+    ((fixed_pt_t(optimizer.txns_while_giveup).div(
+      fixed_pt_t(optimizer.cycles_while_giveup)).div(
+      optimizer.best_ever_throughput))).add(
+     (log_sum.div(fixed_pt_t(optimizer.txns_while_giveup))).sqrt());
+
+  if (ucb_stubborn.number > ucb_halven.number &&
+      ucb_stubborn.number > ucb_giveup.number)
+    {
+      optimizer.optimized_mode = STUBBORN;
+    }
+  else if (ucb_halven.number > ucb_giveup.number)
+    {
+      optimizer.optimized_mode = HALVEN;
+    }
+  else
+    {
+      optimizer.optimized_mode = GIVEUP;
+    }
+
+  // Compute Gradient Descent to regulate the number of retries in HTM.
+  const fixed_pt_t LARGE_DEGRADATION = fixed_pt_t(1.40);
+  const fixed_pt_t MIN_IMPROVEMENT = fixed_pt_t(1.05);
+  const int32_t MAX_RETRIES = 20;
+  const int32_t MIN_RETRIES = 2;
+
+  const fixed_pt_t change_for_better =
+    current_throughput.div(optimizer.last_throughput);
+  const fixed_pt_t change_for_worse =
+    optimizer.last_throughput.div(current_throughput);
+  int32_t last_attempts = optimizer.last_attempts;
+  int32_t current_attempts = htm_fastpath;
+  int32_t new_attempts = current_attempts;
+  if (unlikely(change_for_worse.number > fixed_pt_t(LARGE_DEGRADATION).number))
+    {
+      htm_fastpath = optimizer.best_ever_attempts;
+      optimizer.last_throughput = current_throughput;
+      optimizer.last_attempts = current_attempts;
+      goto finish_reoptimization;
+    }
+
+  if (unlikely(random_int(100) == 0))
+    {
+      optimizer.last_attempts = htm_fastpath;
+      optimizer.last_throughput = current_throughput;
+      htm_fastpath = random_int(MAX_RETRIES - MIN_RETRIES) + MIN_RETRIES;
+      goto finish_reoptimization;
+    }
+
+  if (change_for_better.number > MIN_IMPROVEMENT.number)
+    {
+      new_attempts += current_attempts - last_attempts;
+      if (current_attempts == last_attempts)
+ {
+  new_attempts += random_int(2) == 0 ? -1 : 1;
+ }
+    }
+  else if (change_for_worse.number > MIN_IMPROVEMENT.number)
+    {
+      new_attempts -= current_attempts - last_attempts;
+      if (current_attempts == last_attempts)
+ {
+  new_attempts += random_int(2) == 0 ? -1 : 1;
+ }
+    }
+
+  if (unlikely(new_attempts > MAX_RETRIES))
+    {
+      new_attempts = MAX_RETRIES;
+    }
+  else if (unlikely(new_attempts < MIN_RETRIES))
+    {
+      new_attempts = MIN_RETRIES;
+    }
+
+  if (current_attempts != new_attempts)
+    {
+      optimizer.last_attempts = current_attempts;
+      optimizer.last_throughput = current_throughput;
+    }
+
+  htm_fastpath = new_attempts;
+  if (unlikely(current_throughput.number >
+       optimizer.best_ever_throughput.number))
+    {
+      optimizer.best_ever_throughput = current_throughput;
+      optimizer.best_ever_attempts = current_attempts;
+    }
+
+ finish_reoptimization:
+  GTM::gtm_thread::serial_lock.write_unlock();
+}
+
 uint32_t
 GTM::gtm_thread::begin_transaction (uint32_t prop, const gtm_jmpbuf *jb)
 {
@@ -209,8 +518,36 @@ GTM::gtm_thread::begin_transaction (uint32_t prop,
     }
   // The transaction has aborted.  Don't retry if it's unlikely that
   // retrying the transaction will be successful.
-  if (!htm_abort_should_retry(ret))
-    break;
+  if (optimizer.is_enabled)
+    {
+      // If the HTM self-tuning is enabled, then we act according to
+      // its configuration.
+      if (htm_is_capacity_abort(ret))
+ {
+  gtm_capacity_abort_mode capacity_mode =
+    optimizer.optimized_mode;
+  // Adapt the retries according to the mode. Note that the
+  // linear decrease is implicit in the outer for loop.
+  if (capacity_mode == HALVEN)
+    t = t / 2 + 1;
+  else if (capacity_mode == GIVEUP)
+    break;
+ }
+
+      // Take the chance of having aborted to possibly re-optimize the
+      // the HTM configuration. Do this as a last resort before going
+      // for the serial lock.
+      if (unlikely(tx->number_executed_txns %
+   optimizer.OPTIMIZATION_PERIOD_TXS == 0 && t == 1))
+ optimizer.reoptimize_htm_execution();
+    }
+  else
+    {
+      // Execute the fixed logic when there is no adaptation.
+      if (!htm_abort_should_retry(ret))
+ break;
+    }
+
   // Wait until any concurrent serial-mode transactions have finished.
   // This is an empty critical section, but won't be elided.
   if (serial_lock.is_write_locked())
@@ -248,7 +585,11 @@ GTM::gtm_thread::begin_transaction (uint32_t prop,
   // HTM fastpath aborted, and that we thus have to decide whether to retry
   // the fastpath (returning a_tryHTMFastPath) or just proceed with the
   // fallback method.
-  if (likely(htm_fastpath && (prop & pr_HTMRetryableAbort)))
+  // Even if pr_HTMRetryableAbort is not true, we still consider to retry if
+  // it was a capacity abort (stated by pr_HTMCapacityAbort) and the self
+  // tuning capabilities are enabled (stated by optimizer.is_enabled).
+  if (likely(htm_fastpath && ((prop & pr_HTMRetryableAbort)
+      || ((prop & pr_HTMCapacityAbort) && (optimizer.is_enabled)))))
     {
       tx = gtm_thr();
       if (unlikely(tx == NULL))
@@ -266,6 +607,26 @@ GTM::gtm_thread::begin_transaction (uint32_t prop,

       if (--tx->restart_total > 0)
  {
+  // See above: we use the optimized configuration if enabled.
+  if (optimizer.is_enabled)
+    {
+      // See above: similarly, we adapt upon capacity aborts.
+      if (prop & pr_HTMCapacityAbort)
+ {
+  gtm_capacity_abort_mode capacity_mode = optimizer.optimized_mode;
+  if (capacity_mode == HALVEN)
+    tx->restart_total = tx->restart_total / 2 + 1;
+  else if (capacity_mode == GIVEUP)
+    goto stop_custom_htm_fastpath;
+ }
+
+      // See above: similarly, we periodically re-optimize.
+      if (unlikely(tx->number_executed_txns %
+  optimizer.OPTIMIZATION_PERIOD_TXS == 0 &&
+  tx->restart_total == 1))
+ optimizer.reoptimize_htm_execution();
+    }
+
   // Wait until any concurrent serial-mode transactions have finished.
   // Essentially the same code as above.
   if (serial_lock.is_write_locked())
@@ -665,12 +1026,23 @@ _ITM_commitTransaction(void)
   if (likely(htm_fastpath && !gtm_thread::serial_lock.is_write_locked()))
     {
       htm_commit();
+      gtm_thread *tx = gtm_thr();
+      if (unlikely(tx == NULL))
+ {
+  // See above: when using the HTM fast-path, we might not have created
+  // a thread object yet.
+  tx = new gtm_thread();
+  set_gtm_thr(tx);
+ }
+      tx->number_executed_txns++;
       return;
     }
 #endif
   gtm_thread *tx = gtm_thr();
   if (!tx->trycommit ())
     tx->restart (RESTART_VALIDATE_COMMIT);
+
+  tx->number_executed_txns++;
 }

 void ITM_REGPARM
@@ -681,6 +1053,14 @@ _ITM_commitTransactionEH(void *exc_ptr)
   if (likely(htm_fastpath && !gtm_thread::serial_lock.is_write_locked()))
     {
       htm_commit();
+      gtm_thread *tx = gtm_thr();
+      if (unlikely(tx == NULL))
+ {
+  // See above.
+  tx = new gtm_thread();
+  set_gtm_thr(tx);
+ }
+      tx->number_executed_txns++;
       return;
     }
 #endif
@@ -690,4 +1070,5 @@ _ITM_commitTransactionEH(void *exc_ptr)
       tx->eh_in_flight = exc_ptr;
       tx->restart (RESTART_VALIDATE_COMMIT);
     }
+  tx->number_executed_txns++;
 }
Index: config/s390/target.h
===================================================================
--- config/s390/target.h (revision 219316)
+++ config/s390/target.h (working copy)
@@ -116,12 +116,30 @@ htm_abort_should_retry (uint32_t begin_ret)
 }

 static inline bool
+htm_is_capacity_abort (uint32_t begin_ret)
+{
+  return begin_ret != _HTM_TBEGIN_PERSISTENT;
+}
+
+static inline bool
 htm_transaction_active ()
 {
   __asm volatile (".machine \"all\"  \n\t");
   return __builtin_tx_nesting_depth() != 0;
 }

+static inline uint64_t
+get_efficient_time()
+{
+    return 0;
+}
+
+static inline bool
+allows_reoptimization()
+{
+  return false;
+}
+
 #endif

 } // namespace GTM
Index: config/arm/target.h
===================================================================
--- config/arm/target.h (revision 219316)
+++ config/arm/target.h (working copy)
@@ -46,4 +46,10 @@ cpu_relax (void)
   __sync_synchronize ();
 }

+static inline bool
+allows_reoptimization()
+{
+  return false;
+}
+
 } // namespace GTM
Index: config/powerpc/target.h
===================================================================
--- config/powerpc/target.h (revision 219316)
+++ config/powerpc/target.h (working copy)
@@ -74,6 +74,7 @@ cpu_relax (void)
 #define _TBEGIN_STARTED       0
 #define _TBEGIN_INDETERMINATE 1
 #define _TBEGIN_PERSISTENT    2
+#define _TBEGIN_CAPACITY      3

 /* Number of retries for transient failures.  */
 #define _HTM_RETRIES 10
@@ -101,6 +102,9 @@ htm_begin (void)
   if (_TEXASRU_FAILURE_PERSISTENT (__builtin_get_texasru ()))
     return _TBEGIN_PERSISTENT;

+  if (_TEXASRU_FOOTPRINT_OVERFLOW (__builtin_get_texasru ()))
+    return _TBEGIN_CAPACITY;
+
   return _TBEGIN_INDETERMINATE;
 }

@@ -128,6 +132,12 @@ htm_abort_should_retry (uint32_t begin_ret)
   return begin_ret != _TBEGIN_PERSISTENT;
 }

+static inline bool
+htm_is_capacity_abort (uint32_t begin_ret)
+{
+  return begin_ret == _TBEGIN_CAPACITY;
+}
+
 /* Returns true iff a hardware transaction is currently being executed.  */
 static inline bool
 htm_transaction_active (void)
@@ -135,6 +145,36 @@ htm_transaction_active (void)
   return (_HTM_STATE (__builtin_ttest ()) == _HTM_TRANSACTIONAL);
 }

+static inline uint64_t
+get_efficient_time()
+{
+  #if defined (__powerpc64__) || defined (__ppc64__)
+  uint64_t __tb;
+  __asm__ volatile ("mfspr %[tb], 268\n"
+    : [tb]"=r" (__tb)
+    : );
+  return __tb;
+  #else
+  register unsigned long __tbu, __tbl, __tmp;
+  __asm__ volatile (
+    "0:\n"
+    "mftbu %[tbu]\n"
+    "mftbl %[tbl]\n"
+    "mftbu %[tmp]\n"
+    "cmpw %[tbu], %[tmp]\n"
+    "bne- 0b\n"
+    : [tbu]"=r" (__tbu), [tbl]"=r" (__tbl), [tmp]"=r" (__tmp)
+    : );
+  return (( (uint64_t) __tbu << 32) | __tbl);
+  #endif /* not __powerpc64__ */
+}
+
+static inline bool
+allows_reoptimization()
+{
+  return true;
+}
+
 #endif

 } // namespace GTM
Index: config/alpha/target.h
===================================================================
--- config/alpha/target.h (revision 219316)
+++ config/alpha/target.h (working copy)
@@ -41,4 +41,10 @@ cpu_relax (void)
   __asm volatile ("" : : : "memory");
 }

+static inline bool
+allows_reoptimization()
+{
+  return false;
+}
+
 } // namespace GTM
Index: config/x86/sjlj.S
===================================================================
--- config/x86/sjlj.S (revision 219316)
+++ config/x86/sjlj.S (working copy)
@@ -59,12 +59,14 @@
 #define pr_hasNoAbort 0x08
 #define pr_HTMRetryableAbort 0x800000
 #define pr_HTMRetriedAfterAbort 0x1000000
+#define pr_HTMCapacityAbort     0x2000000
 #define a_runInstrumentedCode 0x01
 #define a_runUninstrumentedCode 0x02
 #define a_tryHTMFastPath 0x20

 #define _XABORT_EXPLICIT (1 << 0)
 #define _XABORT_RETRY (1 << 1)
+#define _XABORT_CAPACITY (1 << 3)

  .text

@@ -111,6 +113,9 @@ SYM(_ITM_beginTransaction):
  testl $(_XABORT_RETRY|_XABORT_EXPLICIT), %eax
  jz .Lno_htm
  orl $pr_HTMRetryableAbort, %edi
+ testl   $(_XABORT_CAPACITY), %eax
+ jz      .Lno_htm
+ orl     $pr_HTMCapacityAbort, %edi
  /* Let the C++ code handle the retry policy.  */
 .Lno_htm:
 #endif
Index: config/x86/target.h
===================================================================
--- config/x86/target.h (revision 219316)
+++ config/x86/target.h (working copy)
@@ -126,13 +126,33 @@ htm_abort_should_retry (uint32_t begin_ret)
   return begin_ret & _XABORT_RETRY;
 }

+static inline bool
+htm_is_capacity_abort(uint32_t begin_ret)
+{
+  return begin_ret & _XABORT_CAPACITY;
+}
+
 /* Returns true iff a hardware transaction is currently being executed.  */
 static inline bool
 htm_transaction_active ()
 {
   return _xtest() != 0;
 }
+
+static inline uint64_t
+get_efficient_time()
+{
+  uint32_t hi, lo;
+  __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
+  return ((unsigned long long) lo) | (((unsigned long long) hi) << 32);
+}
+
+static inline bool
+allows_reoptimization()
+{
+  return true;
+}
+
 #endif

-
 } // namespace GTM
Index: config/aarch64/target.h
===================================================================
--- config/aarch64/target.h (revision 219316)
+++ config/aarch64/target.h (working copy)
@@ -42,4 +42,10 @@ cpu_relax (void)
   __asm volatile ("" : : : "memory");
 }

+static inline bool
+allows_reoptimization()
+{
+  return false;
+}
+
 } // namespace GTM
Index: config/sparc/target.h
===================================================================
--- config/sparc/target.h (revision 219316)
+++ config/sparc/target.h (working copy)
@@ -39,4 +39,10 @@ cpu_relax (void)
   __asm volatile ("rd %%ccr, %%g0" : : : "memory");
 }

+static inline bool
+allows_reoptimization()
+{
+  return false;
+}
+
 } // namespace GTM
Index: config/sh/target.h
===================================================================
--- config/sh/target.h (revision 219316)
+++ config/sh/target.h (working copy)
@@ -44,4 +44,10 @@ cpu_relax (void)
   __asm volatile ("" : : : "memory");
 }

+static inline bool
+allows_reoptimization()
+{
+  return false;
+}
+
 } // namespace GTM
Index: libitm_i.h
===================================================================
--- libitm_i.h (revision 219316)
+++ libitm_i.h (working copy)
@@ -242,6 +242,9 @@ struct gtm_thread
   uint32_t restart_reason[NUM_RESTARTS];
   uint32_t restart_total;

+  // Keeps track of how many transactions were successfully executed.
+  uint64_t number_executed_txns;
+
   // *** The shared part of gtm_thread starts here. ***
   // Shared state is on separate cachelines to avoid false sharing with
   // thread-local parts of gtm_thread.
@@ -309,6 +312,94 @@ struct gtm_thread
   void commit_user_actions ();
 };

+// Definition of fixed point arithmetic types.
+// Half the bits are dedicated to the fractional type, and the rest to the
+// "whole" part.
+struct fixed_pt_t
+{
+  uint64_t number;
+
+  fixed_pt_t(double n)
+  {
+    this->number = static_cast<uint64_t>(n * FIXED_PT_ONE.number +
+      (n >= 0 ? 0.5 : -0.5));
+  }
+
+  fixed_pt_t add(const fixed_pt_t operand) const;
+  fixed_pt_t sub(const fixed_pt_t operand) const;
+  fixed_pt_t mul(const fixed_pt_t operand) const;
+  fixed_pt_t div(const fixed_pt_t operand) const;
+  fixed_pt_t sqrt() const;
+  fixed_pt_t ln() const;
+
+  static fixed_pt_t one();
+
+  static const uint32_t FIXED_PT_WIDTH;
+  static const uint32_t FIXED_PT_INTEGER_WIDTH;
+  static const uint32_t FIXED_PT_FRAC_WIDTH;
+  static const fixed_pt_t FIXED_PT_ONE;
+  static const fixed_pt_t FIXED_PT_TWO;
+};
+
+// Different ways to deal with capacity aborts in HTM execution.
+enum gtm_capacity_abort_mode
+{
+  STUBBORN,
+  HALVEN,
+  GIVEUP
+};
+
+// Maintains the current values optimized for HTM execution and the
+// corresponding statistics gathered for the decision-making.
+struct gtm_global_optimizer
+{
+  // Whether to re-optimize the HTM execution. This is parametrizable by an
+  // env variable.
+  uint32_t is_enabled;
+
+  // Mode chosen to currently deal with capacity aborts.
+  gtm_capacity_abort_mode optimized_mode;
+  // The number of attempts chosen to currently insist on HTM execution is
+  // maintained in htm_fastpath.
+
+  uint64_t last_cycles;
+  uint64_t last_total_txs_executed;
+
+  fixed_pt_t last_throughput;
+  uint32_t last_attempts;
+
+  fixed_pt_t best_ever_throughput;
+  uint32_t best_ever_attempts;
+
+  uint64_t txns_while_stubborn;
+  uint64_t cycles_while_stubborn;
+  uint64_t txns_while_halven;
+  uint64_t cycles_while_halven;
+  uint64_t txns_while_giveup;
+  uint64_t cycles_while_giveup;
+
+  gtm_global_optimizer() :
+    last_throughput(fixed_pt_t(0.0001)),
+    best_ever_throughput(fixed_pt_t(0.0001))
+  {
+  }
+
+  void
+  set_is_enabled(bool arg)
+  {
+    is_enabled = allows_reoptimization() && arg;
+  }
+
+  void reoptimize_htm_execution();
+
+  // Period, in number of transactions, between which we execute the
+  // re-optimization procedure.
+  static const uint32_t OPTIMIZATION_PERIOD_TXS = 500;
+  // Initialization value for the last, fictional, throughput measured.
+  static const fixed_pt_t MIN_THROUGHPUT = fixed_pt_t(0.001);
+  // This serves as an initialization for the cycles used so far.
+  static const uint32_t INIT_CYCLES = 100;
+};
+
 } // namespace GTM

 #include "tls.h"
@@ -346,6 +437,9 @@ extern gtm_cacheline_mask gtm_mask_stack(gtm_cache
 // the name, avoiding complex name mangling.
 extern uint32_t htm_fastpath __asm__(UPFX "gtm_htm_fastpath");

+// Maintains the optimization for HTM execution.
+extern gtm_global_optimizer optimizer;
+
 } // namespace GTM

 #endif // LIBITM_I_H
Index: common.h
===================================================================
--- common.h (revision 219316)
+++ common.h (working copy)
@@ -59,6 +59,11 @@ extern void * xcalloc (size_t s, bool separate_cl
 extern void * xrealloc (void *p, size_t s, bool separate_cl = false)
   __attribute__((malloc, nothrow));

+// Linear congruential number generator.
+//
+// Used also in glibc for generators with small state.
+extern uint32_t random_int(uint32_t max);
+
 } // namespace GTM
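
For reference, the Upper Confidence Bound score computed in
reoptimize_htm_execution() above boils down, for each capacity-abort mode
m in {stubborn, halven, giveup}, to

  ucb_m = (txns_while_m / cycles_while_m) / best_ever_throughput
          + sqrt(2 * ln(txns_while_stubborn + txns_while_halven
                        + txns_while_giveup) / txns_while_m)

i.e., the throughput observed under that mode, normalized by the best
throughput seen so far, plus an exploration bonus that shrinks as the mode
accumulates samples.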

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] add self-tuning to x86 hardware fast path in libitm
  2015-08-24 12:18                     ` Nuno Diegues
@ 2015-11-02  8:58                       ` Nuno Diegues
  2015-11-02 20:59                         ` Andi Kleen
  0 siblings, 1 reply; 20+ messages in thread
From: Nuno Diegues @ 2015-11-02  8:58 UTC (permalink / raw)
  To: Torvald Riegel; +Cc: Andi Kleen, gcc-patches, Paolo Romano

Hello everyone,

gently pinging to bring this back to life given the last patch I emailed.


Best regards,
-- Nuno Diegues

On Mon, Aug 24, 2015 at 12:51 PM, Nuno Diegues <nmld@ist.utl.pt> wrote:
> Hello everyone,
>
> after a summer internship and some backlog catching up in the past
> weeks, I have finally got around to review the patch according to the
> latest discussion.
>
> The changes have not caused regression, and the latest speedup results
> are coherent with what we had before.
>
>
> In the following I add some comments inline to the discussion, and
> after that paste the updated patch.
>
>
>> >
>> > So let's iterate on this in parallel with the other changes that we need
>> > to get in place.  I'd prefer to have some more confidence that measuring
>> > txn throughput in epochs is the best way forward.
>> >
>> > Here are a few thoughts:
>> >
>> > Why do we actually need to measure succeeded transactions?  If a HW txn
>> > is just getting committed without interfering with anything else, is
>> > this really different from, say, two HW txns getting committed?  Aren't
>> > we really just interested in wasted work, or delayed txns?  That may
>> > help taking care of the nontxnal vs. txnal problem.
>> >
>> > Measuring committed txns during a time that might otherwise be spent by
>> > a serial txns could be useful to figure out how much other useful work a
>> > serial txn prevents.  But we'd see that as well if we'd just go serial
>> > during the auto-tuning because then concurrent txns will have to wait;
>> > and in this case we could measure it in the slow path of those
>> > concurrent txns (ie, during waiting, where measurement overhead wouldn't
>> > matter as much).
>> >
>> > If a serial txn is running, concurrent txns (that wait) are free to sync
>> > and tell the serial how much cost it creates for the concurrent txns.
>> > There, txn length could matter, but we won't find out for real until
>> > after the concurrent ones have run (they could be pretty short, so we
>> > can't simply assume that the concurrent ones are as long as the serial
>> > one, so that simply the number of concurrent ones can be used to
>> > calculate delayed work).
>
>
> I understand you concern: measuring failure in a slow path rather than
> success in
> the fast/common path.
>
> My problem is that the approach taken in our work is tailored for accessing one
> given metric, in our case some form of throughput of successfully
> executed atomic
> blocks.
> However, to measure failure, we seem to need two things:
>  1) the rate of failed hardware transactions
>  2) the time spent waiting for the serial lock
>
> In short, the performance will be bad if there is a mix of those two
> issues that is bad
> enough. The problem is how to define a metric that considers those two together?
> It is not obvious to me that there is one. Furthermore, that deviates
> significantly from
> our approach, and in the end --- even if there is a solution that
> works in that way ---
> that is a different approach, not one that can be applied to our
> self-tuning framework.
>
> What we are doing right now is incrementing a counter in a (most
> likely cached line)
> that is single-writer and multiple-reader. Most often, it will be
> owned by the single-writer.
> As such, the cost should be quite negligible compared to the execution
> of the atomic
> block, more so in the case that a Hardware transaction was used, as
> the commit itself
> should be a hundred cycles or more.
>
> A good way to measure this impact is to use the new "htm_notune" flag
> and compare
> its performance with the non-changed libitm that I use as a baseline above.
> The average results are similar, without any trend noticeable outside
> the standard
> deviation for one side or the other.
>
>
>
>
>>
>> >
>> >>
>> >> > Also, note that the mitigation strategy for rdtsc
>> >> > short-comings that you mention in the paper is not applicable in
>> >> > general, specifically not in libitm.
>> >>
>> >>
>> >> I suppose you mean the preemption of threads inflating the cycles measured?
>> >
>> > Yes, preemption and migration of threads (in case there's no global
>> > sync'ed TSC or similar) -- you mentioned in the paper that you pin
>> > threads to cores...
>> >
>> >> This would be similarly a problem to any time source that tries to measure the
>> >> amount of work performed; not sure how we can avoid it in general. Any thoughts?
>> >
>> > Not really as long as we keep depending on measuring time in a
>> > light-weight way.  Measuring smaller time intervals could make it less
>> > likely that preemption happens during such an interval, though.
>> >
>
>
> Note that in this patch we use no such pinning. In fact, I've measured
> that it does not
> have a noticeable impact in performance. Maybe that could be the case with heavy
> over-subscription of the cores with lots of threads.
> Still, it is not a correctness issue, so I believe that the fact this
> performs great in the
> common case should make it prevail.
>
>
>
>>
>> >>
>> >> > Another issue is that we need to implement the tuning in a portable way.
>> >> > You currently make it depend on whether the assembler knows about RTM,
>> >> > whereas the rest of the code makes this more portable and defers to
>> >> > arch-specific files.  I'd prefer if we could use the tuning on other
>> >> > archs too.  But for that, we need to cleanly separate generic from
>> >> > arch-specific parts.  That affects magic numbers as well as things like
>> >> > rdtsc().
>> >>
>> >>
>> >> Yes, I refrained from adding new calls to the arch-specific files, to
>> >> contain the
>> >> changes mainly. But that is possible and that's part of the feedback I
>> >> was hoping
>> >> to get.
>> >
>> > OK.  Let me know if you want further input regarding this.
>
>
> I have established this arch-specific part in the new patch. PTAL
>
>
>
>>
>> >
>> >> > I'm wondering about whether it really makes sense to treat XABORT like
>> >> > conflicts and other abort reasons, instead of like capacity aborts.
>> >> > Perhaps we need to differentiate between the explicit abort codes glibc
>> >> > uses, so that we can distinguish between cases where an abort is
>> >> > supposed to signal incompatibility with txnal execution and cases where
>> >> > it's just used for lock elision (see sysdeps/unix/sysv/linux/x86/hle.h
>> >> > in current glibc):
>> >> > #define _ABORT_LOCK_BUSY        0xff
>> >> > #define _ABORT_LOCK_IS_LOCKED   0xfe
>> >> > #define _ABORT_NESTED_TRYLOCK   0xfd
>> >>
>> >>
>> >> I am not sure I follow: are you suggesting to consider other aborts besides
>> >> those of capacity?
>> >
>> > We may want to do this with UCB, similar to capacity.  Another option
>> > would be to, for now, make fixed choices regarding whether they are
>> > considered permanent or not.
>> > _ABORT_NESTED_TRYLOCK is an incompatibility with txnal execution
>> > (similar to illegal instructions and such).
>> > _ABORT_LOCK_BUSY is failing lock elision due to the lock being already
>> > acquired; thus, this is transient I'd say.
>> >
>> > Andi, what was _ABORT_LOCK_IS_LOCKED used for again?
>
>
> This would seem a more or less trivial change, after it is agreed
> upon. So I suggest
> we leave it for later if/when this one gets through.
>
>
>  >
>>
>> >> > > --- libitm/config/x86/target.h (revision 219316)
>> >> > > +++ libitm/config/x86/target.h (working copy)
>> >> > > @@ -126,12 +126,25 @@ htm_abort_should_retry (uint32_t begin_ret)
>> >> > >    return begin_ret & _XABORT_RETRY;
>> >> > >  }
>> >> > >
>> >> > > +static inline bool
>> >> > > +htm_is_capacity_abort(uint32_t begin_ret)
>> >> > > +{
>> >> > > +  return begin_ret & _XABORT_CAPACITY;
>> >> > > +}
>> >> >
>> >> > Is this indeed just about capacity, or do we actually mean a more
>> >> > general situation?
>> >>
>> >> The idea of the Upper Confidence Bounds (UCB) part of the algorithm was
>> >> to detect "real" capacity aborts (i.e., permanent) vs those that are
>> >> "transient".
>> >> So in this case we really wanted to know if the abort was a capacity one so
>> >> that we could direct the UCB algorithm and understand the "type" of capacity
>> >> abort.
>> >
>> > OK.  But do you want to single out capacity aborts as one case for which
>> > you want to detect permanent vs. transient, or do you want to generally
>> > find out whether aborts that are reported as permanent (e.g., capacity)
>> > are indeed permanent?  In the latter case, a more generic name would be
>> > more appropriate.  (And that's not just a naming thing, but guidance for
>> > other archs regarding what they should put in the "capacity" bucket of
>> > abort reasons...)
>> >
>> >
>
>
> As far as we know, capacity aborts are the ones that fit our criteria:
> they are identifiable
> in a different abort category, yet they are not necessarily
> persistent/deterministic.
> This could be expanded to others in the future, but it is not
> something we tested with, as
> other abort types are not as widespread.
>
>
>
>
>
> Updated patch follows:
>
> Index: util.cc
> ===================================================================
> --- util.cc (revision 219316)
> +++ util.cc (working copy)
> @@ -94,4 +94,15 @@ xrealloc (void *old, size_t size, bool separate_cl
>    return r;
>  }
>
> +// State for the generation is not synchronized. It assumes single thread
> +// usage at any time. It is, in fact, invoked by the thread that is
> +// re-optimizing the libitm HTM usage, so the assumption holds.
> +uint32_t
> +random_int(uint32_t max)
> +{
> +  static uint32_t seed = 123456789;
> +  seed = (1103515245 * seed + 12345);
> +  return seed;
> +}
> +
>  } // namespace GTM
> Index: libitm.h
> ===================================================================
> --- libitm.h (revision 219316)
> +++ libitm.h (working copy)
> @@ -101,7 +101,11 @@ typedef enum
>     /* These are not part of the ABI but used for custom HTM fast paths.  See
>        ITM_beginTransaction and gtm_thread::begin_transaction.  */
>     pr_HTMRetryableAbort = 0x800000,
> -   pr_HTMRetriedAfterAbort = 0x1000000
> +   pr_HTMRetriedAfterAbort = 0x1000000,
> +   /* Reports whether a capacity abort was detected in the custom HTM fast
> +      path, so that it can be used in the abort handler in
> +      gtm_thread::begin_transaction.  */
> +   pr_HTMCapacityAbort          = 0x2000000
>  } _ITM_codeProperties;
>
>  /* Result from startTransaction that describes what actions to take.
> Index: method-serial.cc
> ===================================================================
> --- method-serial.cc (revision 219316)
> +++ method-serial.cc (working copy)
> @@ -223,6 +223,19 @@ struct htm_mg : public method_group
>      // initially disabled.
>  #ifdef USE_HTM_FASTPATH
>      htm_fastpath = htm_init();
> +    optimizer.optimized_mode = STUBBORN;
> +    optimizer.last_cycles = get_efficient_time();
> +    optimizer.last_total_txs_executed = 0;
> +    optimizer.last_throughput = optimizer.MIN_THROUGHPUT;
> +    optimizer.last_attempts = htm_fastpath > 0 ? htm_fastpath - 1 : 1;
> +    optimizer.best_ever_throughput = optimizer.last_throughput;
> +    optimizer.best_ever_attempts = htm_fastpath;
> +    optimizer.txns_while_stubborn = 1;
> +    optimizer.cycles_while_stubborn = optimizer.INIT_CYCLES;
> +    optimizer.txns_while_halven = 1;
> +    optimizer.cycles_while_halven = optimizer.INIT_CYCLES;
> +    optimizer.txns_while_giveup = 1;
> +    optimizer.cycles_while_giveup = optimizer.INIT_CYCLES;
>  #endif
>    }
>    virtual void fini()
> Index: retry.cc
> ===================================================================
> --- retry.cc (revision 219316)
> +++ retry.cc (working copy)
> @@ -218,7 +218,6 @@ GTM::gtm_thread::set_default_dispatch(GTM::abi_dis
>    default_dispatch.store(disp, memory_order_relaxed);
>  }
>
> -
>  static GTM::abi_dispatch*
>  parse_default_method()
>  {
> @@ -254,10 +253,17 @@ parse_default_method()
>        disp = GTM::dispatch_ml_wt();
>        env += 5;
>      }
> +  else if (strncmp(env, "htm_notune", 10) == 0)
> +    {
> +      disp = GTM::dispatch_htm();
> +      env += 10;
> +      GTM::optimizer.set_is_enabled(0);
> +    }
>    else if (strncmp(env, "htm", 3) == 0)
>      {
>        disp = GTM::dispatch_htm();
>        env += 3;
> +      GTM::optimizer.set_is_enabled(1);
>      }
>    else
>      goto unknown;
> Index: beginend.cc
> ===================================================================
> --- beginend.cc (revision 219316)
> +++ beginend.cc (working copy)
> @@ -39,6 +39,9 @@ unsigned GTM::gtm_thread::number_of_threads = 0;
>  gtm_stmlock GTM::gtm_stmlock_array[LOCK_ARRAY_SIZE];
>  atomic<gtm_version> GTM::gtm_clock;
>
> +// Holds the optimized configuration for parameters relevant to HTM execution.
> +gtm_global_optimizer GTM::optimizer;
> +
>  /* ??? Move elsewhere when we figure out library initialization.  */
>  uint64_t GTM::gtm_spin_count_var = 1000;
>
> @@ -95,7 +98,139 @@ thread_exit_init()
>      GTM_fatal("Creating thread release TLS key failed.");
>  }
>
> +/* Implementations for the fixed-point arithmetic type. */
> +const uint32_t GTM::fixed_pt_t::FIXED_PT_WIDTH = 64;
> +const uint32_t GTM::fixed_pt_t::FIXED_PT_INTEGER_WIDTH = 40;
> +const uint32_t GTM::fixed_pt_t::FIXED_PT_FRAC_WIDTH =
> +  GTM::fixed_pt_t::FIXED_PT_WIDTH - GTM::fixed_pt_t::FIXED_PT_INTEGER_WIDTH;
> +const fixed_pt_t GTM::fixed_pt_t::FIXED_PT_ONE = GTM::fixed_pt_t::one();
> +const fixed_pt_t GTM::fixed_pt_t::FIXED_PT_TWO =
> +  GTM::fixed_pt_t::FIXED_PT_ONE.add(GTM::fixed_pt_t::FIXED_PT_ONE);
>
> +fixed_pt_t
> +GTM::fixed_pt_t::one()
> +{
> +  fixed_pt_t result(0);
> +  result.number = static_cast<uint64_t>(1) << FIXED_PT_FRAC_WIDTH;
> +  return result;
> +}
> +
> +fixed_pt_t
> +GTM::fixed_pt_t::add(const fixed_pt_t operand) const
> +{
> +  fixed_pt_t result(0);
> +  result.number = this->number + operand.number;
> +  return result;
> +}
> +
> +fixed_pt_t
> +GTM::fixed_pt_t::sub(const fixed_pt_t operand) const
> +{
> +  fixed_pt_t result(0);
> +  result.number = this->number - operand.number;
> +  return result;
> +}
> +
> +fixed_pt_t
> +GTM::fixed_pt_t::mul(const fixed_pt_t operand) const
> +{
> +  fixed_pt_t result(0);
> +  result.number = (static_cast<__uint128_t>(this->number) *
> +     static_cast<__uint128_t>(operand.number)) >> FIXED_PT_FRAC_WIDTH;
> +  return result;
> +}
> +
> +fixed_pt_t
> +GTM::fixed_pt_t::div(const fixed_pt_t operand) const
> +{
> +  fixed_pt_t result(0);
> +  result.number =
> +    (static_cast<__uint128_t>(this->number) << FIXED_PT_FRAC_WIDTH) /
> +      static_cast<__uint128_t>(operand.number);
> +  return result;
> +}
> +
> +fixed_pt_t
> +GTM::fixed_pt_t::sqrt() const
> +{
> +  uint64_t invert = 0;
> +  uint64_t iter = FIXED_PT_FRAC_WIDTH;
> +  uint64_t i;
> +  fixed_pt_t n = *this;
> +
> +  if (n.number == 0 || n.number == FIXED_PT_ONE.number)
> +    {
> +      return n;
> +    }
> +  if (n.number < FIXED_PT_ONE.number && n.number > 6)
> +    {
> +      invert = 1;
> +      n = FIXED_PT_ONE.div(n);
> +    }
> +  if (n.number > FIXED_PT_ONE.number)
> +    {
> +      uint64_t s = n.number;
> +      iter = 0;
> +      while (s > 0)
> +        {
> +          s >>= 2;
> +          iter++;
> +        }
> +    }
> +
> +  fixed_pt_t l(0);
> +  l.number = (n.number >> 1) + 1;
> +  for (i = 0; i < iter; i++)
> +    {
> +      l.number = l.add(n.div(l)).number >> 1;
> +    }
> +  if (invert)
> +    {
> +      return FIXED_PT_ONE.div(l);
> +    }
> +  return l;
> +}
> +
> +fixed_pt_t
> +GTM::fixed_pt_t::ln() const
> +{
> +  const fixed_pt_t LN2 = fixed_pt_t(0.69314718055994530942);
> +  const fixed_pt_t LG[7] = {
> +    fixed_pt_t(6.666666666666735130e-01),
> +    fixed_pt_t(3.999999999940941908e-01),
> +    fixed_pt_t(2.857142874366239149e-01),
> +    fixed_pt_t(2.222219843214978396e-01),
> +    fixed_pt_t(1.818357216161805012e-01),
> +    fixed_pt_t(1.531383769920937332e-01),
> +    fixed_pt_t(1.479819860511658591e-01)
> +  };
> +
> +  if (this->number == 0)
> +    {
> +      return 0xffffffff;
> +    }
> +
> +  fixed_pt_t log2 = fixed_pt_t(0);
> +  fixed_pt_t xi = *this;
> +  while (xi.number > FIXED_PT_TWO.number)
> +    {
> +      xi.number >>= 1;
> +      log2.number++;
> +    }
> +
> +  fixed_pt_t f = xi.sub(FIXED_PT_ONE);
> +  fixed_pt_t s = f.div(FIXED_PT_TWO.add(f));
> +  fixed_pt_t z = s.mul(s);
> +  fixed_pt_t w = z.mul(z);
> +  fixed_pt_t R =
> +    w.mul(LG[1].add(w.mul(LG[3].add(w.mul(LG[5]))))).add(
> +      z.mul(LG[0].add(w.mul(LG[2].add(
> +        w.mul(LG[4].add(w.mul(LG[6]))))))));
> +
> +  return LN2.mul(fixed_pt_t(log2.number << FIXED_PT_FRAC_WIDTH)).add(
> +    f.sub(s.mul(f.sub(R))));
> +}
> +
>  GTM::gtm_thread::~gtm_thread()
>  {
>    if (nesting > 0)
> @@ -149,6 +284,180 @@ choose_code_path(uint32_t prop, abi_dispatch *disp
>      return a_runInstrumentedCode;
>  }
>
> +// Periodically invoked by some thread(s) that are in the HTM fall-back path.
> +// This implements the Upper Confidence Bounds and Gradient Descent to tune,
> +// respectively, how to deal with HTM capacity aborts and how many HTM retries
> +// to allow. For more details, check Diegues et al. in USENIX ICAC 2014.
> +void
> +GTM::gtm_global_optimizer::reoptimize_htm_execution()
> +{
> +  // Avoid redundant re-optimization if another thread may be doing it already.
> +  // This allows the threads to avoid racing to re-optimize concurrently, which
> +  // is useless.
> +
> +  if (unlikely(GTM::gtm_thread::serial_lock.is_write_locked()))
> +    {
> +      return;
> +    }
> +  // Eventually, some thread(s) will follow this path, even in the face of
> +  // spurious short-cut exits due to the serial lock being taken.
> +  GTM::gtm_thread::serial_lock.write_lock();
> +
> +  // Collect the statistics obtained so far.
> +  uint64_t total_txs_executed = 0;
> +  gtm_thread *ptr = GTM::gtm_thread::list_of_threads;
> +  for (; ptr; ptr = ptr->next_thread)
> +    {
> +      total_txs_executed += ptr->number_executed_txns;
> +    }
> +
> +  // Obtain the delta performance with respect to the last period measured.
> +  // We expect the underlying architecture to provide an efficient way to
> +  // measure time passed since the last re-optimization.
> +  const uint64_t current_cycles = get_efficient_time();
> +  const uint64_t cycles_used = current_cycles - optimizer.last_cycles;
> +  const uint64_t new_txs_executed =
> +    total_txs_executed - optimizer.last_total_txs_executed;
> +  const fixed_pt_t current_throughput =
> +    fixed_pt_t(new_txs_executed).div(fixed_pt_t(cycles_used));
> +
> +  if (current_throughput.number == 0)
> +    {
> +      // This thread probably executed right after another one as they both
> +      // found it was time to re-optimize, but did not contend on the serial
> +      // lock. As such, we avoid redundant re-optimizations and short cut out.
> +      GTM::gtm_thread::serial_lock.write_unlock();
> +      return;
> +    }
> +
> +  optimizer.last_cycles = current_cycles;
> +  optimizer.last_total_txs_executed = total_txs_executed;
> +  if (optimizer.optimized_mode == STUBBORN)
> +    {
> +      optimizer.txns_while_stubborn += new_txs_executed;
> +      optimizer.cycles_while_stubborn += cycles_used;
> +    }
> +  else if (optimizer.optimized_mode == HALVEN)
> +    {
> +      optimizer.txns_while_halven += new_txs_executed;
> +      optimizer.cycles_while_halven += cycles_used;
> +    }
> +  else
> +    {
> +      optimizer.txns_while_giveup += new_txs_executed;
> +      optimizer.cycles_while_giveup += cycles_used;
> +    }
> +
> +  // Compute Upper Confidence Bounds for the mode to choose next to decide on
> +  // how to deal with HTM capacity aborts.
> +  // Use fixed point arithmetic types to spare floating point usage.
> +  const fixed_pt_t log_sum =
> +    GTM::fixed_pt_t::FIXED_PT_TWO.mul(
> +      (fixed_pt_t(optimizer.txns_while_stubborn).add(
> +       fixed_pt_t(optimizer.txns_while_halven)).add(
> +       fixed_pt_t(optimizer.txns_while_giveup))).ln());
> +  const fixed_pt_t ucb_stubborn =
> +    ((fixed_pt_t(optimizer.txns_while_stubborn).div(
> +      fixed_pt_t(optimizer.cycles_while_stubborn))).div(
> +      optimizer.best_ever_throughput)).add(
> +     (log_sum.div(fixed_pt_t(optimizer.txns_while_stubborn))).sqrt());
> +  const fixed_pt_t ucb_halven =
> +    ((fixed_pt_t(optimizer.txns_while_halven).div(
> +      fixed_pt_t(optimizer.cycles_while_halven))).div(
> +      optimizer.best_ever_throughput)).add(
> +     (log_sum.div(fixed_pt_t(optimizer.txns_while_halven))).sqrt());
> +  const fixed_pt_t ucb_giveup =
> +    ((fixed_pt_t(optimizer.txns_while_giveup).div(
> +      fixed_pt_t(optimizer.cycles_while_giveup)).div(
> +      optimizer.best_ever_throughput))).add(
> +     (log_sum.div(fixed_pt_t(optimizer.txns_while_giveup))).sqrt());
> +
> +  if (ucb_stubborn.number > ucb_halven.number &&
> +      ucb_stubborn.number > ucb_giveup.number)
> +    {
> +      optimizer.optimized_mode = STUBBORN;
> +    }
> +  else if (ucb_halven.number > ucb_giveup.number)
> +    {
> +      optimizer.optimized_mode = HALVEN;
> +    }
> +  else
> +    {
> +      optimizer.optimized_mode = GIVEUP;
> +    }
> +
> +  // Compute Gradient Descent to regulate the number of retries in HTM.
> +  const fixed_pt_t LARGE_DEGRADATION = fixed_pt_t(1.40);
> +  const fixed_pt_t MIN_IMPROVEMENT = fixed_pt_t(1.05);
> +  const int32_t MAX_RETRIES = 20;
> +  const int32_t MIN_RETRIES = 2;
> +
> +  const fixed_pt_t change_for_better =
> +    current_throughput.div(optimizer.last_throughput);
> +  const fixed_pt_t change_for_worse =
> +    optimizer.last_throughput.div(current_throughput);
> +  int32_t last_attempts = optimizer.last_attempts;
> +  int32_t current_attempts = htm_fastpath;
> +  int32_t new_attempts = current_attempts;
> +  if (unlikely(change_for_worse.number > fixed_pt_t(LARGE_DEGRADATION).number))
> +    {
> +      htm_fastpath = optimizer.best_ever_attempts;
> +      optimizer.last_throughput = current_throughput;
> +      optimizer.last_attempts = current_attempts;
> +      goto finish_reoptimization;
> +    }
> +
> +  if (unlikely(random_int(100) == 0))
> +    {
> +      optimizer.last_attempts = htm_fastpath;
> +      optimizer.last_throughput = current_throughput;
> +      htm_fastpath = random_int(MAX_RETRIES - MIN_RETRIES) + MIN_RETRIES;
> +      goto finish_reoptimization;
> +    }
> +
> +  if (change_for_better.number > MIN_IMPROVEMENT.number)
> +    {
> +      new_attempts += current_attempts - last_attempts;
> +      if (current_attempts == last_attempts)
> + {
> +  new_attempts += random_int(2) == 0 ? -1 : 1;
> + }
> +    }
> +  else if (change_for_worse.number > MIN_IMPROVEMENT.number)
> +    {
> +      new_attempts -= current_attempts - last_attempts;
> +      if (current_attempts == last_attempts)
> + {
> +  new_attempts += random_int(2) == 0 ? -1 : 1;
> + }
> +    }
> +
> +  if (unlikely(new_attempts > MAX_RETRIES))
> +    {
> +      new_attempts = MAX_RETRIES;
> +    }
> +  else if (unlikely(new_attempts < MIN_RETRIES))
> +    {
> +      new_attempts = MIN_RETRIES;
> +    }
> +
> +  if (current_attempts != new_attempts)
> +    {
> +      optimizer.last_attempts = current_attempts;
> +      optimizer.last_throughput = current_throughput;
> +    }
> +
> +  htm_fastpath = new_attempts;
> +  if (unlikely(current_throughput.number >
> optimizer.best_ever_throughput.number))
> +    {
> +      optimizer.best_ever_throughput = current_throughput;
> +      optimizer.best_ever_attempts = current_attempts;
> +    }
> +
> + finish_reoptimization:
> +  GTM::gtm_thread::serial_lock.write_unlock();
> +}
> +
>  uint32_t
>  GTM::gtm_thread::begin_transaction (uint32_t prop, const gtm_jmpbuf *jb)
>  {
> @@ -209,8 +518,36 @@ GTM::gtm_thread::begin_transaction (uint32_t prop,
>      }
>    // The transaction has aborted.  Don't retry if it's unlikely that
>    // retrying the transaction will be successful.
> -  if (!htm_abort_should_retry(ret))
> -    break;
> +  if (optimizer.is_enabled)
> +    {
> +      // If the HTM self-tuning is enabled, then we act according to
> +      // its configuration.
> +      if (htm_is_capacity_abort(ret))
> +        {
> +          gtm_capacity_abort_mode capacity_mode =
> +            optimizer.optimized_mode;
> +          // Adapt the retries according to the mode. Note that the
> +          // linear decrease is implicit in the outer for loop.
> +          if (capacity_mode == HALVEN)
> +            t = t / 2 + 1;
> +          else if (capacity_mode == GIVEUP)
> +            break;
> +        }
> +
> +      // Take the chance of having aborted to possibly re-optimize
> +      // the HTM configuration. Do this as a last resort before going
> +      // for the serial lock.
> +      if (unlikely(tx->number_executed_txns %
> +                   optimizer.OPTIMIZATION_PERIOD_TXS == 0 && t == 1))
> +        optimizer.reoptimize_htm_execution();
> +    }
> +  else
> +    {
> +      // Execute the fixed logic when there is no adaptation.
> +      if (!htm_abort_should_retry(ret))
> +        break;
> +    }
> +
>    // Wait until any concurrent serial-mode transactions have finished.
>    // This is an empty critical section, but won't be elided.
>    if (serial_lock.is_write_locked())
> @@ -248,7 +585,11 @@ GTM::gtm_thread::begin_transaction (uint32_t prop,
>    // HTM fastpath aborted, and that we thus have to decide whether to retry
>    // the fastpath (returning a_tryHTMFastPath) or just proceed with the
>    // fallback method.
> -  if (likely(htm_fastpath && (prop & pr_HTMRetryableAbort)))
> +  // Even if pr_HTMRetryableAbort is not set, we still consider retrying if
> +  // it was a capacity abort (indicated by pr_HTMCapacityAbort) and the
> +  // self-tuning capabilities are enabled (indicated by optimizer.is_enabled).
> +  if (likely(htm_fastpath && ((prop & pr_HTMRetryableAbort)
> +      || ((prop & pr_HTMCapacityAbort) && (optimizer.is_enabled)))))
>      {
>        tx = gtm_thr();
>        if (unlikely(tx == NULL))
> @@ -266,6 +607,26 @@ GTM::gtm_thread::begin_transaction (uint32_t prop,
>
>        if (--tx->restart_total > 0)
>   {
> +  // See above: we use the optimized configuration if enabled.
> +  if (optimizer.is_enabled)
> +    {
> +      // See above: similarly, we adapt upon capacity aborts.
> +      if (prop & pr_HTMCapacityAbort)
> + {
> +  gtm_capacity_abort_mode capacity_mode = optimizer.optimized_mode;
> +  if (capacity_mode == HALVEN)
> +    tx->restart_total = tx->restart_total / 2 + 1;
> +  else if (capacity_mode == GIVEUP)
> +    goto stop_custom_htm_fastpath;
> + }
> +
> +      // See above: similarly, we periodically re-optimize.
> +      if (unlikely(tx->number_executed_txns %
> +  optimizer.OPTIMIZATION_PERIOD_TXS == 0 &&
> +  tx->restart_total == 1))
> + optimizer.reoptimize_htm_execution();
> +    }
> +
>    // Wait until any concurrent serial-mode transactions have finished.
>    // Essentially the same code as above.
>    if (serial_lock.is_write_locked())
> @@ -665,12 +1026,23 @@ _ITM_commitTransaction(void)
>    if (likely(htm_fastpath && !gtm_thread::serial_lock.is_write_locked()))
>      {
>        htm_commit();
> +      gtm_thread *tx = gtm_thr();
> +      if (unlikely(tx == NULL))
> +        {
> +          // See above: when using the HTM fast-path, we might not have
> +          // created a thread object yet.
> +          tx = new gtm_thread();
> +          set_gtm_thr(tx);
> +        }
> +      tx->number_executed_txns++;
>        return;
>      }
>  #endif
>    gtm_thread *tx = gtm_thr();
>    if (!tx->trycommit ())
>      tx->restart (RESTART_VALIDATE_COMMIT);
> +
> +  tx->number_executed_txns++;
>  }
>
>  void ITM_REGPARM
> @@ -681,6 +1053,14 @@ _ITM_commitTransactionEH(void *exc_ptr)
>    if (likely(htm_fastpath && !gtm_thread::serial_lock.is_write_locked()))
>      {
>        htm_commit();
> +      gtm_thread *tx = gtm_thr();
> +      if (unlikely(tx == NULL))
> +        {
> +          // See above.
> +          tx = new gtm_thread();
> +          set_gtm_thr(tx);
> +        }
> +      tx->number_executed_txns++;
>        return;
>      }
>  #endif
> @@ -690,4 +1070,5 @@ _ITM_commitTransactionEH(void *exc_ptr)
>        tx->eh_in_flight = exc_ptr;
>        tx->restart (RESTART_VALIDATE_COMMIT);
>      }
> +  tx->number_executed_txns++;
>  }
> Index: config/s390/target.h
> ===================================================================
> --- config/s390/target.h (revision 219316)
> +++ config/s390/target.h (working copy)
> @@ -116,12 +116,30 @@ htm_abort_should_retry (uint32_t begin_ret)
>  }
>
>  static inline bool
> +htm_is_capacity_abort (uint32_t begin_ret)
> +{
> +  return begin_ret != _HTM_TBEGIN_PERSISTENT;
> +}
> +
> +static inline bool
>  htm_transaction_active ()
>  {
>    __asm volatile (".machine \"all\"  \n\t");
>    return __builtin_tx_nesting_depth() != 0;
>  }
>
> +static inline uint64_t
> +get_efficient_time()
> +{
> +  // No cheap cycle counter is wired up for s390 here, so re-optimization
> +  // stays disabled (see allows_reoptimization below).
> +  return 0;
> +}
> +
> +static inline bool
> +allows_reoptimization()
> +{
> +  return false;
> +}
> +
>  #endif
>
>  } // namespace GTM
> Index: config/arm/target.h
> ===================================================================
> --- config/arm/target.h (revision 219316)
> +++ config/arm/target.h (working copy)
> @@ -46,4 +46,10 @@ cpu_relax (void)
>    __sync_synchronize ();
>  }
>
> +static inline bool
> +allows_reoptimization()
> +{
> +  return false;
> +}
> +
>  } // namespace GTM
> Index: config/powerpc/target.h
> ===================================================================
> --- config/powerpc/target.h (revision 219316)
> +++ config/powerpc/target.h (working copy)
> @@ -74,6 +74,7 @@ cpu_relax (void)
>  #define _TBEGIN_STARTED       0
>  #define _TBEGIN_INDETERMINATE 1
>  #define _TBEGIN_PERSISTENT    2
> +#define _TBEGIN_CAPACITY      3
>
>  /* Number of retries for transient failures.  */
>  #define _HTM_RETRIES 10
> @@ -101,6 +102,9 @@ htm_begin (void)
>    if (_TEXASRU_FAILURE_PERSISTENT (__builtin_get_texasru ()))
>      return _TBEGIN_PERSISTENT;
>
> +  if (_TEXASRU_FOOTPRINT_OVERFLOW (__builtin_get_texasru ()))
> +    return _TBEGIN_CAPACITY;
> +
>    return _TBEGIN_INDETERMINATE;
>  }
>
> @@ -128,6 +132,12 @@ htm_abort_should_retry (uint32_t begin_ret)
>    return begin_ret != _TBEGIN_PERSISTENT;
>  }
>
> +static inline bool
> +htm_is_capacity_abort (uint32_t begin_ret)
> +{
> +  return begin_ret == _TBEGIN_CAPACITY;
> +}
> +
>  /* Returns true iff a hardware transaction is currently being executed.  */
>  static inline bool
>  htm_transaction_active (void)
> @@ -135,6 +145,36 @@ htm_transaction_active (void)
>    return (_HTM_STATE (__builtin_ttest ()) == _HTM_TRANSACTIONAL);
>  }
>
> +static inline uint64_t
> +get_efficient_time()
> +{
> +  #if defined (__powerpc64__) || defined (__ppc64__)
> +  uint64_t __tb;
> +  __asm__ volatile ("mfspr %[tb], 268\n"
> +    : [tb]"=r" (__tb)
> +    : );
> +  return __tb;
> +  #else
> +  register unsigned long __tbu, __tbl, __tmp;
> +  __asm__ volatile (
> +    "0:\n"
> +    "mftbu %[tbu]\n"
> +    "mftbl %[tbl]\n"
> +    "mftbu %[tmp]\n"
> +    "cmpw %[tbu], %[tmp]\n"
> +    "bne- 0b\n"
> +    : [tbu]"=r" (__tbu), [tbl]"=r" (__tbl), [tmp]"=r" (__tmp)
> +    : );
> +  return (( (uint64_t) __tbu << 32) | __tbl);
> +  #endif /* not __powerpc64__ */
> +}
> +
> +static inline bool
> +allows_reoptimization()
> +{
> +  return true;
> +}
> +
>  #endif
>
>  } // namespace GTM
> Index: config/alpha/target.h
> ===================================================================
> --- config/alpha/target.h (revision 219316)
> +++ config/alpha/target.h (working copy)
> @@ -41,4 +41,10 @@ cpu_relax (void)
>    __asm volatile ("" : : : "memory");
>  }
>
> +static inline bool
> +allows_reoptimization()
> +{
> +  return false;
> +}
> +
>  } // namespace GTM
> Index: config/x86/sjlj.S
> ===================================================================
> --- config/x86/sjlj.S (revision 219316)
> +++ config/x86/sjlj.S (working copy)
> @@ -59,12 +59,14 @@
>  #define pr_hasNoAbort 0x08
>  #define pr_HTMRetryableAbort 0x800000
>  #define pr_HTMRetriedAfterAbort 0x1000000
> +#define pr_HTMCapacityAbort     0x2000000
>  #define a_runInstrumentedCode 0x01
>  #define a_runUninstrumentedCode 0x02
>  #define a_tryHTMFastPath 0x20
>
>  #define _XABORT_EXPLICIT (1 << 0)
>  #define _XABORT_RETRY (1 << 1)
> +#define _XABORT_CAPACITY (1 << 3)
>
>   .text
>
> @@ -111,6 +113,9 @@ SYM(_ITM_beginTransaction):
>   testl $(_XABORT_RETRY|_XABORT_EXPLICIT), %eax
>   jz .Lno_htm
>   orl $pr_HTMRetryableAbort, %edi
> + testl   $(_XABORT_CAPACITY), %eax
> + jz      .Lno_htm
> + orl     $pr_HTMCapacityAbort, %edi
>   /* Let the C++ code handle the retry policy.  */
>  .Lno_htm:
>  #endif
> Index: config/x86/target.h
> ===================================================================
> --- config/x86/target.h (revision 219316)
> +++ config/x86/target.h (working copy)
> @@ -126,13 +126,33 @@ htm_abort_should_retry (uint32_t begin_ret)
>    return begin_ret & _XABORT_RETRY;
>  }
>
> +static inline bool
> +htm_is_capacity_abort(uint32_t begin_ret)
> +{
> +  return begin_ret & _XABORT_CAPACITY;
> +}
> +
>  /* Returns true iff a hardware transaction is currently being executed.  */
>  static inline bool
>  htm_transaction_active ()
>  {
>    return _xtest() != 0;
>  }
> +
> +static inline uint64_t
> +get_efficient_time()
> +{
> +  uint32_t hi, lo;
> +  __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
> +  return ((unsigned long long) lo) | (((unsigned long long) hi) << 32);
> +}
> +
> +static inline bool
> +allows_reoptimization()
> +{
> +  return true;
> +}
> +
>  #endif
>
> -
>  } // namespace GTM
> Index: config/aarch64/target.h
> ===================================================================
> --- config/aarch64/target.h (revision 219316)
> +++ config/aarch64/target.h (working copy)
> @@ -42,4 +42,10 @@ cpu_relax (void)
>    __asm volatile ("" : : : "memory");
>  }
>
> +static inline bool
> +allows_reoptimization()
> +{
> +  return false;
> +}
> +
>  } // namespace GTM
> Index: config/sparc/target.h
> ===================================================================
> --- config/sparc/target.h (revision 219316)
> +++ config/sparc/target.h (working copy)
> @@ -39,4 +39,10 @@ cpu_relax (void)
>    __asm volatile ("rd %%ccr, %%g0" : : : "memory");
>  }
>
> +static inline bool
> +allows_reoptimization()
> +{
> +  return false;
> +}
> +
>  } // namespace GTM
> Index: config/sh/target.h
> ===================================================================
> --- config/sh/target.h (revision 219316)
> +++ config/sh/target.h (working copy)
> @@ -44,4 +44,10 @@ cpu_relax (void)
>    __asm volatile ("" : : : "memory");
>  }
>
> +static inline bool
> +allows_reoptimization()
> +{
> +  return false;
> +}
> +
>  } // namespace GTM
> Index: libitm_i.h
> ===================================================================
> --- libitm_i.h (revision 219316)
> +++ libitm_i.h (working copy)
> @@ -242,6 +242,9 @@ struct gtm_thread
>    uint32_t restart_reason[NUM_RESTARTS];
>    uint32_t restart_total;
>
> +  // Keeps track of how many transactions were successfully executed.
> +  uint64_t number_executed_txns;
> +
>    // *** The shared part of gtm_thread starts here. ***
>    // Shared state is on separate cachelines to avoid false sharing with
>    // thread-local parts of gtm_thread.
> @@ -309,6 +312,94 @@ struct gtm_thread
>    void commit_user_actions ();
>  };
>
> +// Definition of a fixed-point arithmetic type.
> +// Half the bits are dedicated to the fractional part, and the rest to
> +// the integer part.
> +struct fixed_pt_t
> +{
> +  uint64_t number;
> +
> +  fixed_pt_t(double n)
> +  {
> +    this->number = static_cast<uint64_t>(n * FIXED_PT_ONE.number
> +                                         + (n >= 0 ? 0.5 : -0.5));
> +  }
> +
> +  fixed_pt_t add(const fixed_pt_t operand) const;
> +  fixed_pt_t sub(const fixed_pt_t operand) const;
> +  fixed_pt_t mul(const fixed_pt_t operand) const;
> +  fixed_pt_t div(const fixed_pt_t operand) const;
> +  fixed_pt_t sqrt() const;
> +  fixed_pt_t ln() const;
> +
> +  static fixed_pt_t one();
> +
> +  static const uint32_t FIXED_PT_WIDTH;
> +  static const uint32_t FIXED_PT_INTEGER_WIDTH;
> +  static const uint32_t FIXED_PT_FRAC_WIDTH;
> +  static const fixed_pt_t FIXED_PT_ONE;
> +  static const fixed_pt_t FIXED_PT_TWO;
> +};
> +
> +// Different ways to deal with capacity aborts in HTM execution.
> +enum gtm_capacity_abort_mode
> +{
> +  STUBBORN,
> +  HALVEN,
> +  GIVEUP
> +};
> +
> +// Maintains the current values optimized for HTM execution and the
> +// corresponding statistics gathered for the decision-making.
> +struct gtm_global_optimizer
> +{
> +  // Whether to re-optimize the HTM execution. This is controlled by an
> +  // environment variable.
> +  uint32_t is_enabled;
> +
> +  // Mode chosen to currently deal with capacity aborts.
> +  gtm_capacity_abort_mode optimized_mode;
> +  // The number of attempts currently chosen for insisting on HTM execution
> +  // is maintained in htm_fastpath.
> +
> +  uint64_t last_cycles;
> +  uint64_t last_total_txs_executed;
> +
> +  fixed_pt_t last_throughput;
> +  uint32_t last_attempts;
> +
> +  fixed_pt_t best_ever_throughput;
> +  uint32_t best_ever_attempts;
> +
> +  uint64_t txns_while_stubborn;
> +  uint64_t cycles_while_stubborn;
> +  uint64_t txns_while_halven;
> +  uint64_t cycles_while_halven;
> +  uint64_t txns_while_giveup;
> +  uint64_t cycles_while_giveup;
> +
> +  gtm_global_optimizer() :
> +    last_throughput(fixed_pt_t(0.0001)),
> +    best_ever_throughput(fixed_pt_t(0.0001))
> +  {
> +  }
> +
> +  void
> +  set_is_enabled(bool arg)
> +  {
> +    is_enabled = allows_reoptimization() && arg;
> +  }
> +
> +  void reoptimize_htm_execution();
> +
> +  // Period, in number of transactions, after which we execute the
> +  // re-optimization procedure.
> +  static const uint32_t OPTIMIZATION_PERIOD_TXS = 500;
> +  // Initialization value for the last, fictional, throughput measured.
> +  static const fixed_pt_t MIN_THROUGHPUT = fixed_pt_t(0.001);
> +  // This serves as an initialization for the cycles used so far.
> +  static const uint32_t INIT_CYCLES = 100;
> +};
> +
>  } // namespace GTM
>
>  #include "tls.h"
> @@ -346,6 +437,9 @@ extern gtm_cacheline_mask gtm_mask_stack(gtm_cache
>  // the name, avoiding complex name mangling.
>  extern uint32_t htm_fastpath __asm__(UPFX "gtm_htm_fastpath");
>
> +// Maintains the optimization for HTM execution.
> +extern gtm_global_optimizer optimizer;
> +
>  } // namespace GTM
>
>  #endif // LIBITM_I_H
> Index: common.h
> ===================================================================
> --- common.h (revision 219316)
> +++ common.h (working copy)
> @@ -59,6 +59,11 @@ extern void * xcalloc (size_t s, bool separate_cl
>  extern void * xrealloc (void *p, size_t s, bool separate_cl = false)
>    __attribute__((malloc, nothrow));
>
> +// Linear congruential number generator.
> +//
> +// Used also in glibc for generators with small state.
> +extern uint32_t random_int(uint32_t max);
> +
>  } // namespace GTM
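
As a reading aid for the tuning rule implemented by reoptimize_htm_execution
above, here is a minimal standalone sketch of the same hill-climbing update,
kept deliberately simple: plain doubles stand in for the patch's fixed_pt_t,
std::mt19937 stands in for libitm's random_int, and the tuning_state struct,
the RNG seed, and the LARGE_DEGRADATION value of 4.0 are illustrative
assumptions rather than part of the patch.

// Minimal sketch of the retry-count tuning rule (not libitm code).
#include <algorithm>
#include <cstdint>
#include <random>

struct tuning_state
{
  int32_t attempts = 5;               // plays the role of htm_fastpath
  int32_t last_attempts = 4;
  double last_throughput = 1e-4;
  int32_t best_ever_attempts = 5;
  double best_ever_throughput = 1e-4;
};

static std::mt19937 rng (42);

static int32_t
random_int (int32_t max)
{
  return std::uniform_int_distribution<int32_t> (0, max - 1) (rng);
}

// One re-optimization step: hill climbing on the number of HTM attempts,
// with occasional random exploration and a jump back to the best-ever
// configuration after a large throughput drop.
static void
reoptimize (tuning_state &s, double current_throughput)
{
  const double MIN_IMPROVEMENT = 1.05;
  const double LARGE_DEGRADATION = 4.0;  // assumed value, defined elsewhere
  const int32_t MIN_RETRIES = 2, MAX_RETRIES = 20;

  if (s.last_throughput / current_throughput > LARGE_DEGRADATION)
    {
      // Throughput collapsed: revert to the best configuration seen so far.
      s.last_throughput = current_throughput;
      s.last_attempts = s.attempts;
      s.attempts = s.best_ever_attempts;
      return;
    }

  if (random_int (100) == 0)
    {
      // With probability 1/100, explore a random number of attempts.
      s.last_throughput = current_throughput;
      s.last_attempts = s.attempts;
      s.attempts = MIN_RETRIES + random_int (MAX_RETRIES - MIN_RETRIES);
      return;
    }

  // Keep moving in the direction that improved throughput, reverse it on a
  // degradation, and perturb randomly when the last step was zero.
  const int32_t step = s.attempts - s.last_attempts;
  int32_t next = s.attempts;
  if (current_throughput / s.last_throughput > MIN_IMPROVEMENT)
    next += step != 0 ? step : (random_int (2) == 0 ? -1 : 1);
  else if (s.last_throughput / current_throughput > MIN_IMPROVEMENT)
    next += step != 0 ? -step : (random_int (2) == 0 ? -1 : 1);

  next = std::min (MAX_RETRIES, std::max (MIN_RETRIES, next));

  if (next != s.attempts)
    {
      s.last_attempts = s.attempts;
      s.last_throughput = current_throughput;
    }
  if (current_throughput > s.best_ever_throughput)
    {
      s.best_ever_throughput = current_throughput;
      s.best_ever_attempts = s.attempts;
    }
  s.attempts = next;
}

int
main ()
{
  tuning_state s;
  // Feed a few synthetic throughput samples and observe how the number of
  // attempts adapts.
  for (double thr : { 1.0, 1.2, 1.5, 1.4, 0.3, 1.6 })
    reoptimize (s, thr);
  return 0;
}

The point of the rule is that it needs no offline profiling: each period it
only compares the throughput just measured against the previous period and
against the best configuration observed so far, occasionally exploring a
random retry count to escape local optima.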


* Re: [PATCH] add self-tuning to x86 hardware fast path in libitm
  2015-11-02  8:58                       ` Nuno Diegues
@ 2015-11-02 20:59                         ` Andi Kleen
  0 siblings, 0 replies; 20+ messages in thread
From: Andi Kleen @ 2015-11-02 20:59 UTC (permalink / raw)
  To: Nuno Diegues; +Cc: Torvald Riegel, gcc-patches, Paolo Romano

Nuno Diegues <nmld@ist.utl.pt> writes:

> Hello everyone,
>
> gently pinging to bring this back to life given the last patch I emailed.

The patch is fine for me, but I cannot approve it.
-Andi


end of thread, other threads:[~2015-11-02 20:59 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
2015-04-07 17:36 [PATCH] add self-tuning to x86 hardware fast path in libitm Nuno Diegues
2015-04-08 14:05 ` Andi Kleen
2015-04-08 15:29   ` Nuno Diegues
2015-04-08 17:55     ` Andi Kleen
2015-04-09 10:42       ` Nuno Diegues
2015-04-10 12:24         ` Andi Kleen
2015-04-30  3:52           ` Nuno Diegues
2015-04-30  5:00             ` Andi Kleen
2015-04-30 12:13               ` Nuno Diegues
2015-05-05 12:49                 ` Nuno Diegues
2015-05-14 22:21             ` [PING] " Andi Kleen
2015-05-18 21:31             ` Torvald Riegel
2015-05-18 21:48               ` Andi Kleen
2015-05-18 22:04                 ` Torvald Riegel
2015-05-19  4:54               ` Nuno Diegues
2015-05-19 22:31                 ` Torvald Riegel
2015-06-14 21:49                   ` Nuno Diegues
2015-08-24 12:18                     ` Nuno Diegues
2015-11-02  8:58                       ` Nuno Diegues
2015-11-02 20:59                         ` Andi Kleen
