Uprobes is enhanced to use "single-stepping out of line" (SSOL) to
avoid probe misses in multithreaded applications.  SSOL also reduces
probe overhead by 25-30%.

After a breakpoint has been hit and uprobes has run the probepoint's
handler(s), uprobes must execute the probed instruction in the context
of the probed process.  There are two commonly accepted ways to do
this:

o Single-stepping inline (SSIL): Temporarily replace the breakpoint
instruction with the original instruction; single-step the
instruction; restore the breakpoint instruction; and allow the thread
to continue.  This method is typically used by interactive debuggers
such as gdb, and is also used in the uprobes base patch.  This
approach doesn't work acceptably for multithreaded programs, because
while the breakpoint is temporarily removed, other threads can sail
past the probepoint.  It also requires two writes to the probed
process's text for every probe hit.

o Single-stepping out of line (SSOL): Place a copy of the original
instruction somewhere in the probed process's address space;
single-step the copy; fix up the thread state as necessary; and allow
the thread to continue.  This approach is used by kprobes.

This implementation of SSOL entails two major components:

1) Allocation and management of an "SSOL area."  Before handling the
first probe hit, uprobes allocates a VM area in the probed process's
address space, and divides it into "instruction slots."  The first
time a probepoint is hit, an instruction slot is allocated to it and
a copy of the probed instruction is placed there.  Multiple threads
can march through that probepoint simultaneously, all using the same
slot.

Currently, we allocate a VM area only for probed processes (rather
than at exec time for every process), its size is one page, and it
never grows.  Slots are recycled, as necessary, on a
least-recently-used basis.

2) Architecture-specific fix-ups for certain instructions.
If the effect of an instruction depends on its address, the thread's
registers and/or stack must be fixed up after the instruction-copy is
single-stepped.  For i386 uprobes, the fixups were stolen from i386
kprobes.

---

 Documentation/uprobes.txt  |   25 +--
 arch/i386/Kconfig          |    4 
 arch/i386/kernel/Makefile  |    1 
 arch/i386/kernel/uprobes.c |  135 +++++++++++++++++
 include/asm-i386/mmu.h     |    1 
 include/linux/uprobes.h    |   87 +++++++++++
 kernel/uprobes.c           |  342 +++++++++++++++++++++++++++++++++++++++++++--
 7 files changed, 569 insertions(+), 26 deletions(-)

diff -puN Documentation/uprobes.txt~2-uprobes-ssol Documentation/uprobes.txt
--- linux-2.6.21-rc6/Documentation/uprobes.txt~2-uprobes-ssol	2007-05-24 15:41:01.000000000 -0700
+++ linux-2.6.21-rc6-jimk/Documentation/uprobes.txt	2007-05-24 15:41:28.000000000 -0700
@@ -54,14 +54,13 @@ handler the addresses of the uprobe stru
 The handler may block, but keep in mind that the probed thread
 remains stopped while your handler runs.
 
-Next, Uprobes single-steps the probed instruction and resumes execution
-of the probed process at the instruction following the probepoint.
-[Note: In the base uprobes patch, we temporarily remove the breakpoint
-instruction, insert the original opcode, single-step the instruction
-"inline", and then replace the breakpoint.  This can create problems
-in a multithreaded application.  For example, it opens a time window
-during which another thread can sail right past the probepoint.
-This problem is resolved in the "single-stepping out of line" patch.]
+Next, Uprobes single-steps its copy of the probed instruction and
+resumes execution of the probed process at the instruction following
+the probepoint.  (It would be simpler to single-step the actual
+instruction in place, but then Uprobes would have to temporarily
+remove the breakpoint instruction.  This would create problems in a
+multithreaded application.  For example, it would open a time window
+when another thread could sail right past the probepoint.)
 
 1.2 The Role of Utrace
 
@@ -287,15 +286,15 @@ create a new set of uprobe objects.)
 
 8. Probe Overhead
 
 // TODO: Adjust as other architectures are tested.
-On a typical CPU in use in 2007, a uprobe hit takes 3 to 4
-microseconds to process.  Specifically, a benchmark that hits the same
-probepoint repeatedly, firing a simple handler each time, reports
-250,000 to 300,000 hits per second, depending on the architecture.
+On a typical CPU in use in 2007, a uprobe hit takes about 3
+microseconds to process.  Specifically, a benchmark that hits the
+same probepoint repeatedly, firing a simple handler each time, reports
+300,000 to 350,000 hits per second, depending on the architecture.
 Here are sample overhead figures (in usec) for different architectures.
 
 i386: Intel Pentium M, 1495 MHz, 2957.31 bogomips
-4.2 usec/hit (single-stepping inline)
+2.9 usec/hit (single-stepping out of line)
 
 x86_64: AMD Opteron 246, 1994 MHz, 3971.48 bogomips
 // TODO

diff -puN arch/i386/Kconfig~2-uprobes-ssol arch/i386/Kconfig
--- linux-2.6.21-rc6/arch/i386/Kconfig~2-uprobes-ssol	2007-05-24 15:41:01.000000000 -0700
+++ linux-2.6.21-rc6-jimk/arch/i386/Kconfig	2007-05-24 15:41:28.000000000 -0700
@@ -87,6 +87,10 @@ config DMI
 	bool
 	default y
 
+config UPROBES_SSOL
+	bool
+	default y
+
 source "init/Kconfig"
 
 menu "Processor type and features"

diff -puN arch/i386/kernel/Makefile~2-uprobes-ssol arch/i386/kernel/Makefile
--- linux-2.6.21-rc6/arch/i386/kernel/Makefile~2-uprobes-ssol	2007-05-24 15:41:01.000000000 -0700
+++ linux-2.6.21-rc6-jimk/arch/i386/kernel/Makefile	2007-05-24 15:41:28.000000000 -0700
@@ -41,6 +41,7 @@ obj-$(CONFIG_EARLY_PRINTK)	+= early_prin
 obj-$(CONFIG_HPET_TIMER)	+= hpet.o
 obj-$(CONFIG_K8_NB)		+= k8.o
 obj-$(CONFIG_STACK_UNWIND)	+= unwind.o
+obj-$(CONFIG_UPROBES)		+= uprobes.o
 obj-$(CONFIG_VMI)		+= vmi.o vmitime.o
 obj-$(CONFIG_PARAVIRT)		+= paravirt.o
 

diff -puN /dev/null arch/i386/kernel/uprobes.c
--- /dev/null	2007-05-25 07:05:01.545112516 -0700
+++ linux-2.6.21-rc6-jimk/arch/i386/kernel/uprobes.c	2007-05-24 15:41:28.000000000 -0700
@@ -0,0 +1,135 @@
+/*
+ * Userspace Probes (UProbes)
+ * arch/i386/kernel/uprobes.c
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2006
+ */
+#define UPROBES_IMPLEMENTATION 1
+#include <linux/uprobes.h>
+#include
+
+/*
+ * Get an instruction slot from the process's SSOL area, containing the
+ * instruction at ppt's probepoint.  Point the eip at that slot, in
+ * preparation for single-stepping out of line.
+ */
+void uprobe_pre_ssout(struct uprobe_task *utask, struct uprobe_probept *ppt,
+	struct pt_regs *regs)
+{
+	struct uprobe_ssol_slot *slot;
+
+	slot = uprobe_get_insn_slot(ppt);
+	if (!slot) {
+		utask->doomed = 1;
+		return;
+	}
+	regs->eip = (long)slot->insn;
+	utask->singlestep_addr = regs->eip;
+}
+
+/*
+ * Called by uprobe_post_ssout() to adjust the return address
+ * pushed by a call instruction executed out-of-line.
+ */
+static void adjust_ret_addr(long esp, long correction,
+	struct uprobe_task *utask)
+{
+	int nleft;
+	long ra;
+
+	nleft = copy_from_user(&ra, (const void __user *) esp, 4);
+	if (unlikely(nleft != 0))
+		goto fail;
+	ra += correction;
+	nleft = copy_to_user((void __user *) esp, &ra, 4);
+	if (unlikely(nleft != 0))
+		goto fail;
+	return;
+
+fail:
+	printk(KERN_ERR
+		"uprobes: Failed to adjust return address after"
+		" single-stepping call instruction;"
+		" pid=%d, esp=%#lx\n", current->pid, esp);
+	utask->doomed = 1;
+}
+
+/*
+ * Called after single-stepping.  ppt->vaddr is the address of the
+ * instruction whose first byte has been replaced by the "int3"
+ * instruction.  To avoid the SMP problems that can occur when we
+ * temporarily put back the original opcode to single-step, we
+ * single-stepped a copy of the instruction.  The address of this
+ * copy is utask->singlestep_addr.
+ *
+ * This function prepares to return from the post-single-step
+ * interrupt.  We have to fix up the stack as follows:
+ *
+ * 0) Typically, the new eip is relative to the copied instruction.  We
+ * need to make it relative to the original instruction.  Exceptions are
+ * return instructions and absolute or indirect jump or call instructions.
+ *
+ * 1) If the single-stepped instruction was a call, the return address
+ * that is atop the stack is the address following the copied instruction.
+ * We need to make it the address following the original instruction.
+ */
+void uprobe_post_ssout(struct uprobe_task *utask, struct uprobe_probept *ppt,
+	struct pt_regs *regs)
+{
+	long next_eip = 0;
+	long copy_eip = utask->singlestep_addr;
+	long orig_eip = ppt->vaddr;
+
+	up_read(&ppt->slot->rwsem);
+
+	switch (ppt->insn[0]) {
+	case 0xc3:		/* ret/lret */
+	case 0xcb:
+	case 0xc2:
+	case 0xca:
+		next_eip = regs->eip;
+		/* eip is already adjusted, no more changes required */
+		break;
+	case 0xe8:		/* call relative - Fix return addr */
+		adjust_ret_addr(regs->esp, (orig_eip - copy_eip), utask);
+		break;
+	case 0xff:
+		if ((ppt->insn[1] & 0x30) == 0x10) {
+			/* call absolute, indirect */
+			/* Fix return addr; eip is correct. */
+			next_eip = regs->eip;
+			adjust_ret_addr(regs->esp, (orig_eip - copy_eip),
+				utask);
+		} else if ((ppt->insn[1] & 0x31) == 0x20 ||
+			   (ppt->insn[1] & 0x31) == 0x21) {
+			/* jmp near or jmp far absolute indirect */
+			/* eip is correct. */
+			next_eip = regs->eip;
+		}
+		break;
+	case 0xea:		/* jmp absolute -- eip is correct */
+		next_eip = regs->eip;
+		break;
+	default:
+		break;
+	}
+
+	if (next_eip)
+		regs->eip = next_eip;
+	else
+		regs->eip = orig_eip + (regs->eip - copy_eip);
+}

diff -puN include/asm-i386/mmu.h~2-uprobes-ssol include/asm-i386/mmu.h
--- linux-2.6.21-rc6/include/asm-i386/mmu.h~2-uprobes-ssol	2007-05-24 15:41:01.000000000 -0700
+++ linux-2.6.21-rc6-jimk/include/asm-i386/mmu.h	2007-05-24 15:41:28.000000000 -0700
@@ -13,6 +13,7 @@ typedef struct {
 	struct semaphore sem;
 	void *ldt;
 	void *vdso;
+	void *uprobes_ssol_area;
 } mm_context_t;
 
 #endif

diff -puN include/linux/uprobes.h~2-uprobes-ssol include/linux/uprobes.h
--- linux-2.6.21-rc6/include/linux/uprobes.h~2-uprobes-ssol	2007-05-24 15:41:01.000000000 -0700
+++ linux-2.6.21-rc6-jimk/include/linux/uprobes.h	2007-05-24 15:41:28.000000000 -0700
@@ -87,6 +87,60 @@ enum uprobe_task_state {
 #define UPROBE_HASH_BITS 5
 #define UPROBE_TABLE_SIZE (1 << UPROBE_HASH_BITS)
 
+#define UINSNS_PER_PAGE	(PAGE_SIZE/MAX_UINSN_BYTES)
+
+/* Used when deciding which instruction slot to steal. */
+enum uprobe_slot_state {
+	SSOL_FREE,
+	SSOL_ASSIGNED,
+	SSOL_BEING_STOLEN
+};
+
+/*
+ * For a uprobe_process that uses an SSOL area, there's an array of these
+ * objects matching the array of instruction slots in the SSOL area.
+ */
+struct uprobe_ssol_slot {
+	/* The slot in the SSOL area that holds the instruction-copy */
+	__user uprobe_opcode_t *insn;
+
+	enum uprobe_slot_state state;
+
+	/* The probepoint that currently owns this slot */
+	struct uprobe_probept *owner;
+
+	/*
+	 * Read-locked when slot is in use during single-stepping.
+	 * Write-locked by stealing task.
+	 */
+	struct rw_semaphore rwsem;
+
+	/* Used for LRU heuristics.  If this overflows, it's OK. */
+	unsigned long last_used;
+};
+
+/*
+ * The per-process single-stepping out-of-line (SSOL) area
+ */
+struct uprobe_ssol_area {
+	/* Array of instruction slots in the vma we allocate */
+	__user uprobe_opcode_t *insn_area;
+
+	int nslots;
+	int nfree;
+
+	/* Array of slot objects, one per instruction slot */
+	struct uprobe_ssol_slot *slots;
+
+	/* lock held while finding a free slot */
+	spinlock_t lock;
+
+	/* Next slot to steal */
+	int next_slot;
+
+	/* Ensures 2 threads don't try to set up the vma simultaneously. */
+	struct mutex setup_mutex;
+};
 
 /*
  * uprobe_process -- not a user-visible struct.
@@ -136,6 +190,18 @@ struct uprobe_process {
 	 * since once the last thread has exited, the rest is academic.
 	 */
 	struct kref refcount;
+
+	/*
+	 * Manages slots for instruction-copies to be single-stepped
+	 * out of line.
+	 */
+	struct uprobe_ssol_area ssol_area;
+
+	/*
+	 * 1 to single-step out of line; 0 for inline.  This can drop to
+	 * 0 if we can't set up the SSOL area, but never goes from 0 to 1.
+	 */
+	int sstep_out_of_line;
 };
 
 /*
@@ -204,6 +270,19 @@ struct uprobe_probept {
 	 * prevent probe misses while the breakpoint is swapped out.
	 */
 	struct mutex ssil_mutex;
+
+	/*
+	 * We put the instruction-copy here to single-step it.
+	 * We don't own it unless slot->owner points back to us.
+	 */
+	struct uprobe_ssol_slot *slot;
+
+	/*
+	 * Hold this while stealing an insn slot to ensure that no
+	 * other thread, having also hit this probepoint, simultaneously
+	 * steals a slot for it.
+	 */
+	struct mutex slot_mutex;
 };
 
 /*
@@ -248,6 +327,14 @@ struct uprobe_task {
 	int doomed;
 };
 
+#ifdef CONFIG_UPROBES_SSOL
+extern struct uprobe_ssol_slot *uprobe_get_insn_slot(struct uprobe_probept*);
+extern void uprobe_pre_ssout(struct uprobe_task*, struct uprobe_probept*,
+	struct pt_regs*);
+extern void uprobe_post_ssout(struct uprobe_task*, struct uprobe_probept*,
+	struct pt_regs*);
+#endif
+
 #endif	/* UPROBES_IMPLEMENTATION */
 
 #endif	/* _LINUX_UPROBES_H */

diff -puN kernel/uprobes.c~2-uprobes-ssol kernel/uprobes.c
--- linux-2.6.21-rc6/kernel/uprobes.c~2-uprobes-ssol	2007-05-24 15:41:01.000000000 -0700
+++ linux-2.6.21-rc6-jimk/kernel/uprobes.c	2007-05-24 15:56:25.000000000 -0700
@@ -33,6 +33,7 @@
 #include
 #include
 #include
+#include
 
 #define SET_ENGINE_FLAGS	1
 #define CLEAR_ENGINE_FLAGS	0
@@ -341,6 +342,7 @@ static int quiesce_all_threads(struct up
 static void uprobe_free_process(struct uprobe_process *uproc)
 {
 	struct uprobe_task *utask, *tmp;
+	struct uprobe_ssol_area *area = &uproc->ssol_area;
 
 	if (!hlist_unhashed(&uproc->hlist))
 		hlist_del(&uproc->hlist);
@@ -354,6 +356,8 @@ static void uprobe_free_process(struct u
 		utrace_detach(utask->tsk, utask->engine);
 		kfree(utask);
 	}
+	if (area->slots)
+		kfree(area->slots);
 	up_write(&uproc->rwsem);	// So kfree doesn't complain
 	kfree(uproc);
 }
@@ -496,6 +500,14 @@ static struct uprobe_process *uprobe_mk_
 	INIT_HLIST_NODE(&uproc->hlist);
 	uproc->tgid = p->tgid;
 
+	uproc->ssol_area.insn_area = NULL;
+	mutex_init(&uproc->ssol_area.setup_mutex);
+#ifdef CONFIG_UPROBES_SSOL
+	uproc->sstep_out_of_line = 1;
+#else
+	uproc->sstep_out_of_line = 0;
+#endif
+
 	/*
	 * Create and populate one utask per thread in this process.  We
	 * can't call uprobe_add_task() while holding tasklist_lock, so we:
@@ -545,6 +557,8 @@ static struct uprobe_probept *uprobe_add
 		return ERR_PTR(-ENOMEM);
 	init_waitqueue_head(&ppt->waitq);
 	mutex_init(&ppt->ssil_mutex);
+	mutex_init(&ppt->slot_mutex);
+	ppt->slot = NULL;
 
 	/* Connect to uk. */
 	INIT_LIST_HEAD(&ppt->uprobe_list);
@@ -566,6 +580,25 @@ static struct uprobe_probept *uprobe_add
 	return ppt;
 }
 
+/* ppt is going away.  Free its slot (if it owns one) in the SSOL area. */
+static void uprobe_free_slot(struct uprobe_probept *ppt)
+{
+	struct uprobe_ssol_slot *slot = ppt->slot;
+	if (slot) {
+		down_write(&slot->rwsem);
+		if (slot->owner == ppt) {
+			unsigned long flags;
+			struct uprobe_ssol_area *area = &ppt->uproc->ssol_area;
+			spin_lock_irqsave(&area->lock, flags);
+			slot->state = SSOL_FREE;
+			slot->owner = NULL;
+			area->nfree++;
+			spin_unlock_irqrestore(&area->lock, flags);
+		}
+		up_write(&slot->rwsem);
+	}
+}
+
 /*
  * Runs with ppt->uproc write-locked.  Frees ppt and decrements the ref count
  * on ppt->uproc (but ref count shouldn't hit 0).
 */
@@ -573,6 +606,7 @@ static struct uprobe_probept *uprobe_add
 static void uprobe_free_probept(struct uprobe_probept *ppt)
 {
 	struct uprobe_process *uproc = ppt->uproc;
+	uprobe_free_slot(ppt);
 	hlist_del(&ppt->ut_node);
 	uproc->nppt--;
 	kfree(ppt);
@@ -598,7 +632,7 @@ static void purge_uprobe(struct uprobe_k
 	uprobe_free_probept(ppt);
 }
 
-/* Probed address must be in an executable VM area. */
+/* Probed address must be in an executable VM area, outside the SSOL area. */
 static int uprobe_validate_vaddr(struct task_struct *p, unsigned long vaddr)
 {
 	struct vm_area_struct *vma;
@@ -607,7 +641,8 @@ static int uprobe_validate_vaddr(struct
 		return -EINVAL;
 	down_read(&mm->mmap_sem);
 	vma = find_vma(mm, vaddr);
-	if (!vma || vaddr < vma->vm_start || !(vma->vm_flags & VM_EXEC)) {
+	if (!vma || vaddr < vma->vm_start || !(vma->vm_flags & VM_EXEC)
+	    || vma->vm_start == (unsigned long) mm->context.uprobes_ssol_area) {
 		up_read(&mm->mmap_sem);
 		return -EINVAL;
 	}
@@ -840,6 +875,256 @@ done:
 }
 
 /*
+ * Functions for allocation of the SSOL area, and the instruction slots
+ * therein
+ */
+
+/*
+ * Mmap a page for the uprobes SSOL area for the current process.
+ * Returns with mm->context.uprobes_ssol_area pointing at the page,
+ * or set to a negative errno.
+ * This approach was suggested by Roland McGrath.
+ */
+static void uprobe_setup_ssol_vma(void)
+{
+	unsigned long addr;
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+
+	down_write(&mm->mmap_sem);
+	/*
+	 * Find the end of the top mapping and skip a page.
+	 * If there is no space for PAGE_SIZE above
+	 * that, mmap will ignore our address hint.
+	 */
+	vma = rb_entry(rb_last(&mm->mm_rb), struct vm_area_struct, vm_rb);
+	addr = vma->vm_end + PAGE_SIZE;
+	addr = do_mmap_pgoff(NULL, addr, PAGE_SIZE, PROT_EXEC,
+		MAP_PRIVATE|MAP_ANONYMOUS, 0);
+	if (addr & ~PAGE_MASK) {
+		up_write(&mm->mmap_sem);
+		mm->context.uprobes_ssol_area = ERR_PTR(addr);
+		printk(KERN_ERR "Uprobes failed to allocate a vma for"
+			" pid/tgid %d/%d for single-stepping out of line.\n",
+			current->pid, current->tgid);
+		return;
+	}
+
+	vma = find_vma(mm, addr);
+	BUG_ON(!vma);
+	/* avoid vma copy on fork() and don't expand when mremap() */
+	vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND;
+
+	up_write(&mm->mmap_sem);
+	mm->context.uprobes_ssol_area = (void *) addr;
+}
+
+/*
+ * Initialize per-process area for single stepping out-of-line.
+ * Must be run by a thread in the probed process.  Returns with
+ * area->insn_area pointing to the initialized area, or set to a
+ * negative errno.
+ */
+static void uprobe_init_ssol(struct uprobe_ssol_area *area)
+{
+	struct uprobe_ssol_slot *slot;
+	int i;
+	char *slot_addr;	// Simplify pointer arithmetic
+
+	/*
+	 * If we previously probed this process and then removed all
+	 * probes, the vma is still available to us.
+	 */
+	if (!current->mm->context.uprobes_ssol_area)
+		uprobe_setup_ssol_vma();
+	area->insn_area = (uprobe_opcode_t *)
+		current->mm->context.uprobes_ssol_area;
+	if (IS_ERR(area->insn_area))
+		return;
+
+	area->slots = (struct uprobe_ssol_slot *)
+		kzalloc(sizeof(struct uprobe_ssol_slot) *
+		UINSNS_PER_PAGE, GFP_USER);
+	if (!area->slots) {
+		area->insn_area = ERR_PTR(-ENOMEM);
+		return;
+	}
+	area->nfree = area->nslots = UINSNS_PER_PAGE;
+	spin_lock_init(&area->lock);
+	area->next_slot = 0;
+	slot_addr = (char *) area->insn_area;
+	for (i = 0; i < UINSNS_PER_PAGE; i++) {
+		slot = &area->slots[i];
+		init_rwsem(&slot->rwsem);
+		slot->state = SSOL_FREE;
+		slot->owner = NULL;
+		slot->last_used = 0;
+		slot->insn = (__user uprobe_opcode_t *) slot_addr;
+		slot_addr += MAX_UINSN_BYTES;
+	}
+}
+
+/*
+ * Verify that the SSOL area has been set up for uproc.  Returns a
+ * pointer to the SSOL area, or a negative errno if we couldn't set
+ * it up.
+ */
+static __user uprobe_opcode_t
+		*uprobe_verify_ssol(struct uprobe_process *uproc)
+{
+	struct uprobe_ssol_area *area = &uproc->ssol_area;
+
+	if (unlikely(!area->insn_area)) {
+		/* First time through for this probed process */
+		static DEFINE_MUTEX(ssol_setup_mutex);
+		mutex_lock(&ssol_setup_mutex);
+		if (likely(!area->insn_area))
+			/* Nobody snuck in and set things up ahead of us. */
+			uprobe_init_ssol(area);
+		mutex_unlock(&ssol_setup_mutex);
+	}
+	return area->insn_area;
+}
+
+static inline int advance_slot(int slot, struct uprobe_ssol_area *area)
+{
+	return (slot + 1) % area->nslots;
+}
+
+/*
+ * Return the slot number of the least-recently-used slot in the
+ * neighborhood of area->next_slot.  Limit the number of slots we test
+ * to keep it fast.  Nobody dies if this isn't the best choice.
+ */
+static int uprobe_lru_insn_slot(struct uprobe_ssol_area *area)
+{
+#define MAX_LRU_TESTS 10
+	struct uprobe_ssol_slot *s;
+	int lru_slot = -1;
+	unsigned long lru_time = ULONG_MAX;
+	int nr_lru_tests = 0;
+	int slot = area->next_slot;
+	do {
+		s = &area->slots[slot];
+		if (likely(s->state == SSOL_ASSIGNED)) {
+			if (lru_time > s->last_used) {
+				lru_time = s->last_used;
+				lru_slot = slot;
+			}
+			if (++nr_lru_tests >= MAX_LRU_TESTS)
+				break;
+		}
+		slot = advance_slot(slot, area);
+	} while (slot != area->next_slot);
+
+	if (unlikely(lru_slot < 0))
+		/* All slots are in the act of being stolen.  Join the melee. */
+		return area->next_slot;
+	else
+		return lru_slot;
+}
+
+/*
+ * Choose an instruction slot and take it.  Choose a free slot if there is one.
+ * Otherwise choose the least-recently-used slot.  Returns with slot
+ * read-locked and containing the desired instruction.  Runs with
+ * ppt->slot_mutex locked.
+ */
+static struct uprobe_ssol_slot
+		*uprobe_take_insn_slot(struct uprobe_probept *ppt)
+{
+	struct uprobe_process *uproc = ppt->uproc;
+	struct uprobe_ssol_area *area = &uproc->ssol_area;
+	struct uprobe_ssol_slot *s;
+	int len, slot;
+	unsigned long flags;
+
+	spin_lock_irqsave(&area->lock, flags);
+
+	if (area->nfree) {
+		for (slot = 0; slot < area->nslots; slot++) {
+			if (area->slots[slot].state == SSOL_FREE) {
+				area->nfree--;
+				goto found_slot;
+			}
+		}
+		/* Shouldn't get here.  Fix nfree and get on with life. */
+		area->nfree = 0;
+	}
+	slot = uprobe_lru_insn_slot(area);
+
+found_slot:
+	area->next_slot = advance_slot(slot, area);
+	s = &area->slots[slot];
+	s->state = SSOL_BEING_STOLEN;
+
+	spin_unlock_irqrestore(&area->lock, flags);
+
+	/* Wait for current users of slot to finish. */
+	down_write(&s->rwsem);
+	ppt->slot = s;
+	s->owner = ppt;
+	s->last_used = jiffies;
+	s->state = SSOL_ASSIGNED;
+	/* Copy the original instruction to the chosen slot. */
+	len = access_process_vm(current, (unsigned long) s->insn,
+		ppt->insn, MAX_UINSN_BYTES, 1);
+	if (unlikely(len < MAX_UINSN_BYTES)) {
+		up_write(&s->rwsem);
+		printk(KERN_ERR "Failed to copy instruction at %#lx"
+			" to SSOL area (%#lx)\n", ppt->vaddr,
+			(unsigned long) area->slots);
+		return NULL;
+	}
+	/* Let other threads single-step in this slot. */
+	downgrade_write(&s->rwsem);
+	return s;
+}
+
+/* ppt doesn't own a slot.  Get one for ppt, and return it read-locked. */
+static struct uprobe_ssol_slot
+		*uprobe_find_insn_slot(struct uprobe_probept *ppt)
+{
+	struct uprobe_ssol_slot *slot;
+
+	mutex_lock(&ppt->slot_mutex);
+	slot = ppt->slot;
+	if (unlikely(slot && slot->owner == ppt)) {
+		/* Looks like another thread snuck in and got a slot for us. */
+		down_read(&slot->rwsem);
+		if (likely(slot->owner == ppt)) {
+			slot->last_used = jiffies;
+			mutex_unlock(&ppt->slot_mutex);
+			return slot;
+		}
+		/* ... but then somebody stole it. */
+		up_read(&slot->rwsem);
+	}
+	slot = uprobe_take_insn_slot(ppt);
+	mutex_unlock(&ppt->slot_mutex);
+	return slot;
+}
+
+/*
+ * Ensure that ppt owns an instruction slot for single-stepping.
+ * Returns with the slot read-locked and ppt->slot pointing at it.
+ */
+struct uprobe_ssol_slot *uprobe_get_insn_slot(struct uprobe_probept *ppt)
+{
+	struct uprobe_ssol_slot *slot = ppt->slot;
+
+	if (unlikely(!slot))
+		return uprobe_find_insn_slot(ppt);
+
+	down_read(&slot->rwsem);
+	if (unlikely(slot->owner != ppt)) {
+		up_read(&slot->rwsem);
+		return uprobe_find_insn_slot(ppt);
+	}
+	slot->last_used = jiffies;
+	return slot;
+}
+
+/*
  * utrace engine report callbacks
  */
@@ -939,6 +1224,8 @@ static inline void uprobe_post_ssin(stru
 	mutex_unlock(&ppt->ssil_mutex);
 }
 
+/* uprobe_pre_ssout() and uprobe_post_ssout() are architecture-specific. */
+
 /*
  * Signal callback:
 *
@@ -982,7 +1269,19 @@ static u32 uprobe_report_signal(struct u
 	if (action != UTRACE_SIGNAL_CORE || info->si_signo != SIGTRAP)
 		goto no_interest;
 
-	uproc = utask->uproc;
+	/*
+	 * Set up the SSOL area if it's not already there.  We do this
+	 * here because we have to do it before handling the first
+	 * probepoint hit, the probed process has to do it, and this may
+	 * be the first time our probed process runs uprobes code.
+	 */
+	uproc = utask->uproc;
+#ifdef CONFIG_UPROBES_SSOL
+	if (uproc->sstep_out_of_line &&
+	    unlikely(IS_ERR(uprobe_verify_ssol(uproc))))
+		uproc->sstep_out_of_line = 0;
+#endif
+
 	switch (utask->state) {
 	case UPTASK_RUNNING:
 		down_read(&uproc->rwsem);
@@ -1005,7 +1304,12 @@ static u32 uprobe_report_signal(struct u
 		}
 
 		utask->state = UPTASK_PRE_SSTEP;
-		uprobe_pre_ssin(utask, ppt, regs);
+#ifdef CONFIG_UPROBES_SSOL
+		if (uproc->sstep_out_of_line)
+			uprobe_pre_ssout(utask, ppt, regs);
+		else
+#endif
+			uprobe_pre_ssin(utask, ppt, regs);
 		if (unlikely(utask->doomed))
 			do_exit(SIGSEGV);
 		utask->state = UPTASK_SSTEP;
@@ -1020,7 +1324,12 @@ static u32 uprobe_report_signal(struct u
 		ppt = utask->active_probe;
 		BUG_ON(!ppt);
 		utask->state = UPTASK_POST_SSTEP;
-		uprobe_post_ssin(utask, ppt);
+#ifdef CONFIG_UPROBES_SSOL
+		if (uproc->sstep_out_of_line)
+			uprobe_post_ssout(utask, ppt, regs);
+		else
+#endif
+			uprobe_post_ssin(utask, ppt);
 		if (unlikely(utask->doomed))
 			do_exit(SIGSEGV);
 
@@ -1200,14 +1509,21 @@ static u32 uprobe_report_exit(struct utr
 			printk(KERN_WARNING "Task died at uprobe probepoint:"
 				" pid/tgid = %d/%d, probepoint = %#lx\n",
 				tsk->pid, tsk->tgid, ppt->vaddr);
-		switch (utask->state) {
-		case UPTASK_PRE_SSTEP:
-		case UPTASK_SSTEP:
-		case UPTASK_POST_SSTEP:
-			mutex_unlock(&ppt->ssil_mutex);
-			break;
-		default:
-			break;
+		/* Mutex cleanup depends on where we died and SSOL vs. SSIL. */
+		if (uproc->sstep_out_of_line) {
+			if (utask->state == UPTASK_SSTEP
+			    && ppt->slot && ppt->slot->owner == ppt)
+				up_read(&ppt->slot->rwsem);
+		} else {
+			switch (utask->state) {
+			case UPTASK_PRE_SSTEP:
+			case UPTASK_SSTEP:
+			case UPTASK_POST_SSTEP:
+				mutex_unlock(&ppt->ssil_mutex);
+				break;
+			default:
+				break;
+			}
 		}
 		up_read(&uproc->rwsem);
 	}
_