From: Roland McGrath
To: libc-ports@sourceware.org
Subject: [PATCH roland/arm-memcpy] ARM: Make armv7 memcpy implementations SFI-friendly
Message-Id: <20130618220149.CB94E2C0E4@topped-with-meat.com>
Date: Tue, 18 Jun 2013 22:01:00 -0000

This makes the new memcpy implementation(s) usable for arm-nacl.

I've tested on armv7l-linux-gnueabihf that 'make check subdirs=string'
has no errors after this change.  I've further verified that the only
differences in the compiled code are due to the register allocation
changes (so not even address offsets change).

I've tested the actually-new code by locally hacking arm-features.h to
define ARM_ALWAYS_BX to 1 and ARM_BX_ALIGN_LOG2 to 4; that build still
has no errors in 'make check subdirs=string'.  (I've also verified that
the arm-nacl build of this code passes the NaCl validator.)

Since this is so non-perturbing, I hope I can commit it before the
impending freeze.  It would not be a terrible imposition if it had to
wait, but it would be somewhat inconvenient for me.

Thanks,
Roland


ports/ChangeLog.arm
	* sysdeps/arm/arm-features.h (ARM_BX_NINSNS): New macro.
	* sysdeps/arm/armv7/multiarch/memcpy_impl.S: Macroize the
	computed-jump dispatch sections.  Use sfi_breg throughout.
	[ARM_ALWAYS_BX]: Define a different version of the dispatch
	macros that uses bx rather than add-to-pc, and respects
	ARM_BX_ALIGN_LOG2.
	[!USE_NEON] (D_l, D_h): Use r10, r11 rather than r8, r9.
	(tmp2): Use r8 rather than r10.

--- a/ports/sysdeps/arm/arm-features.h
+++ b/ports/sysdeps/arm/arm-features.h
@@ -53,6 +53,14 @@
 # define ARM_BX_ALIGN_LOG2 2
 #endif

+/* The number of instructions that 'bx' expands to.  A more-specific
+   arm-features.h that defines 'bx' as a macro should define this to the
+   number of instructions it expands to.  This is used only in a context
+   where the 'bx' expansion won't cross an ARM_BX_ALIGN_LOG2 boundary.  */
+#ifndef ARM_BX_NINSNS
+# define ARM_BX_NINSNS 1
+#endif
+
 /* An OS-specific arm-features.h file may define ARM_NO_INDEX_REGISTER to
    indicate that the two-register addressing modes must never be used.  */

--- a/ports/sysdeps/arm/armv7/multiarch/memcpy_impl.S
+++ b/ports/sysdeps/arm/armv7/multiarch/memcpy_impl.S
@@ -33,6 +33,7 @@
 #define NO_THUMB
 #endif
 #include <sysdep.h>
+#include <arm-features.h>

	.syntax unified
	/* This implementation requires ARM state.  */
@@ -71,7 +72,139 @@
 /* Locals.  */
 #define tmp1 r3
 #define dst ip
-#define tmp2 r10
+#define tmp2 r8
+
+/* These two macros both work by repeated invocation of the macro
+   dispatch_step (not defined here).  That macro performs one "step",
+   doing one load instruction and one store instruction to copy one
+   "unit".  On entry, TMP1 contains the number of bytes to be copied,
+   a multiple of the unit size.  The macro clobbers TMP1 in the
+   process of doing a computed jump to the tail containing the
+   appropriate number of steps.
+
+   In dispatch_7_dword, dispatch_step is invoked seven times, with an
+   argument that is 7 for the first and 1 for the last.  Units are
+   double-words (8 bytes).  TMP1 is at most 56.
+
+   In dispatch_15_word, dispatch_step is invoked fifteen times,
+   with an argument that is 15 for the first and 1 for the last.
+   Units are words (4 bytes).  TMP1 is at most 60.  */
+
+#ifndef ARM_ALWAYS_BX
+# if ARM_BX_ALIGN_LOG2 != 2
+#  error case not handled
+# endif
+	.macro dispatch_7_dword
+	rsb	tmp1, tmp1, #((7 * 8) - PC_OFS + INSN_SIZE)
+	add	pc, pc, tmp1
+	dispatch_step 7
+	dispatch_step 6
+	dispatch_step 5
+	dispatch_step 4
+	dispatch_step 3
+	dispatch_step 2
+	dispatch_step 1
+	.purgem dispatch_step
+	.endm
+
+	.macro dispatch_15_word
+	rsb	tmp1, tmp1, #((15 * 4) - PC_OFS/2 + INSN_SIZE/2)
+	add	pc, pc, tmp1, lsl #1
+	dispatch_step 15
+	dispatch_step 14
+	dispatch_step 13
+	dispatch_step 12
+	dispatch_step 11
+	dispatch_step 10
+	dispatch_step 9
+	dispatch_step 8
+	dispatch_step 7
+	dispatch_step 6
+	dispatch_step 5
+	dispatch_step 4
+	dispatch_step 3
+	dispatch_step 2
+	dispatch_step 1
+	.purgem dispatch_step
+	.endm
+#else
+# if ARM_BX_ALIGN_LOG2 < 4
+#  error case not handled
+# endif
+	.macro dispatch_helper steps, log2_bytes_per_step
+	.p2align ARM_BX_ALIGN_LOG2
+	/* TMP1 gets (max_bytes - bytes_to_copy), where max_bytes is
+	   (STEPS << LOG2_BYTES_PER_STEP).
+	   So this is (steps_to_skip << LOG2_BYTES_PER_STEP).  */
+	rsb	tmp1, tmp1, #(\steps << \log2_bytes_per_step)
+	/* Pad so that the add;bx pair immediately precedes an alignment
+	   boundary.  Hence, TMP1=0 will run all the steps.  */
+	.rept (1 << (ARM_BX_ALIGN_LOG2 - 2)) - (2 + ARM_BX_NINSNS)
+	nop
+	.endr
+	/* Shifting down LOG2_BYTES_PER_STEP gives us the number of
+	   steps to skip, then shifting up ARM_BX_ALIGN_LOG2 gives us
+	   the (byte) distance to add to the PC.  */
+	add	tmp1, pc, tmp1, lsl #(ARM_BX_ALIGN_LOG2 - \log2_bytes_per_step)
+	bx	tmp1
+	.endm
+
+	.macro dispatch_7_dword
+	dispatch_helper 7, 3
+	.p2align ARM_BX_ALIGN_LOG2
+	dispatch_step 7
+	.p2align ARM_BX_ALIGN_LOG2
+	dispatch_step 6
+	.p2align ARM_BX_ALIGN_LOG2
+	dispatch_step 5
+	.p2align ARM_BX_ALIGN_LOG2
+	dispatch_step 4
+	.p2align ARM_BX_ALIGN_LOG2
+	dispatch_step 3
+	.p2align ARM_BX_ALIGN_LOG2
+	dispatch_step 2
+	.p2align ARM_BX_ALIGN_LOG2
+	dispatch_step 1
+	.p2align ARM_BX_ALIGN_LOG2
+	.purgem dispatch_step
+	.endm
+
+	.macro dispatch_15_word
+	dispatch_helper 15, 2
+	dispatch_step 15
+	.p2align ARM_BX_ALIGN_LOG2
+	dispatch_step 14
+	.p2align ARM_BX_ALIGN_LOG2
+	dispatch_step 13
+	.p2align ARM_BX_ALIGN_LOG2
+	dispatch_step 12
+	.p2align ARM_BX_ALIGN_LOG2
+	dispatch_step 11
+	.p2align ARM_BX_ALIGN_LOG2
+	dispatch_step 10
+	.p2align ARM_BX_ALIGN_LOG2
+	dispatch_step 9
+	.p2align ARM_BX_ALIGN_LOG2
+	dispatch_step 8
+	.p2align ARM_BX_ALIGN_LOG2
+	dispatch_step 7
+	.p2align ARM_BX_ALIGN_LOG2
+	dispatch_step 6
+	.p2align ARM_BX_ALIGN_LOG2
+	dispatch_step 5
+	.p2align ARM_BX_ALIGN_LOG2
+	dispatch_step 4
+	.p2align ARM_BX_ALIGN_LOG2
+	dispatch_step 3
+	.p2align ARM_BX_ALIGN_LOG2
+	dispatch_step 2
+	.p2align ARM_BX_ALIGN_LOG2
+	dispatch_step 1
+	.p2align ARM_BX_ALIGN_LOG2
+	.purgem dispatch_step
+	.endm
+
+#endif

 #ifndef USE_NEON
 /* For bulk copies using GP registers.  */
@@ -81,8 +214,9 @@
 #define B_h r5
 #define C_l r6
 #define C_h r7
-#define D_l r8
-#define D_h r9
+/* Don't use the pair r8,r9 because in some EABI variants r9 is reserved.  */
+#define D_l r10
+#define D_h r11
 #endif

 /* Number of lines ahead to pre-fetch data.  If you change this the code
    below will need adjustment to compensate.  */
@@ -92,40 +226,71 @@
 #ifdef USE_VFP
	.macro	cpy_line_vfp vreg, base
-	vstr	\vreg, [dst, #\base]
-	vldr	\vreg, [src, #\base]
-	vstr	d0, [dst, #\base + 8]
-	vldr	d0, [src, #\base + 8]
-	vstr	d1, [dst, #\base + 16]
-	vldr	d1, [src, #\base + 16]
-	vstr	d2, [dst, #\base + 24]
-	vldr	d2, [src, #\base + 24]
-	vstr	\vreg, [dst, #\base + 32]
-	vldr	\vreg, [src, #\base + prefetch_lines * 64 - 32]
-	vstr	d0, [dst, #\base + 40]
-	vldr	d0, [src, #\base + 40]
-	vstr	d1, [dst, #\base + 48]
-	vldr	d1, [src, #\base + 48]
-	vstr	d2, [dst, #\base + 56]
-	vldr	d2, [src, #\base + 56]
+	sfi_breg dst, \
+	vstr	\vreg, [\B, #\base]
+	sfi_breg src, \
+	vldr	\vreg, [\B, #\base]
+	sfi_breg dst, \
+	vstr	d0, [\B, #\base + 8]
+	sfi_breg src, \
+	vldr	d0, [\B, #\base + 8]
+	sfi_breg dst, \
+	vstr	d1, [\B, #\base + 16]
+	sfi_breg src, \
+	vldr	d1, [\B, #\base + 16]
+	sfi_breg dst, \
+	vstr	d2, [\B, #\base + 24]
+	sfi_breg src, \
+	vldr	d2, [\B, #\base + 24]
+	sfi_breg dst, \
+	vstr	\vreg, [\B, #\base + 32]
+	sfi_breg src, \
+	vldr	\vreg, [\B, #\base + prefetch_lines * 64 - 32]
+	sfi_breg dst, \
+	vstr	d0, [\B, #\base + 40]
+	sfi_breg src, \
+	vldr	d0, [\B, #\base + 40]
+	sfi_breg dst, \
+	vstr	d1, [\B, #\base + 48]
+	sfi_breg src, \
+	vldr	d1, [\B, #\base + 48]
+	sfi_breg dst, \
+	vstr	d2, [\B, #\base + 56]
+	sfi_breg src, \
+	vldr	d2, [\B, #\base + 56]
	.endm

	.macro	cpy_tail_vfp vreg, base
-	vstr	\vreg, [dst, #\base]
-	vldr	\vreg, [src, #\base]
-	vstr	d0, [dst, #\base + 8]
-	vldr	d0, [src, #\base + 8]
-	vstr	d1, [dst, #\base + 16]
-	vldr	d1, [src, #\base + 16]
-	vstr	d2, [dst, #\base + 24]
-	vldr	d2, [src, #\base + 24]
-	vstr	\vreg, [dst, #\base + 32]
-	vstr	d0, [dst, #\base + 40]
-	vldr	d0, [src, #\base + 40]
-	vstr	d1, [dst, #\base + 48]
-	vldr	d1, [src, #\base + 48]
-	vstr	d2, [dst, #\base + 56]
-	vldr	d2, [src, #\base + 56]
+	sfi_breg dst, \
+	vstr	\vreg, [\B, #\base]
+	sfi_breg src, \
+	vldr	\vreg, [\B, #\base]
+	sfi_breg dst, \
+	vstr	d0, [\B, #\base + 8]
+	sfi_breg src, \
+	vldr	d0, [\B, #\base + 8]
+	sfi_breg dst, \
+	vstr	d1, [\B, #\base + 16]
+	sfi_breg src, \
+	vldr	d1, [\B, #\base + 16]
+	sfi_breg dst, \
+	vstr	d2, [\B, #\base + 24]
+	sfi_breg src, \
+	vldr	d2, [\B, #\base + 24]
+	sfi_breg dst, \
+	vstr	\vreg, [\B, #\base + 32]
+	sfi_breg dst, \
+	vstr	d0, [\B, #\base + 40]
+	sfi_breg src, \
+	vldr	d0, [\B, #\base + 40]
+	sfi_breg dst, \
+	vstr	d1, [\B, #\base + 48]
+	sfi_breg src, \
+	vldr	d1, [\B, #\base + 48]
+	sfi_breg dst, \
+	vstr	d2, [\B, #\base + 56]
+	sfi_breg src, \
+	vldr	d2, [\B, #\base + 56]
	.endm
 #endif

@@ -140,81 +305,61 @@ ENTRY(memcpy)

 .Ltail63unaligned:
 #ifdef USE_NEON
+	/* These need an extra layer of macro just to work around a
+	   bug in the assembler's parser when an operand starts with
+	   a {...}.  */
+	.macro neon_load_d0 reg
+	vld1.8	{d0}, [\reg]!
+	.endm
+	.macro neon_store_d0 reg
+	vst1.8	{d0}, [\reg]!
+	.endm
+
+	/* These are used by the NaCl sfi_breg macro.  */
+	.macro _sfi_breg_dmask_neon_load_d0 reg
+	_sfi_dmask \reg
+	.endm
+	.macro _sfi_breg_dmask_neon_store_d0 reg
+	_sfi_dmask \reg
+	.endm
+
	and	tmp1, count, #0x38
-	rsb	tmp1, tmp1, #(56 - PC_OFS + INSN_SIZE)
-	add	pc, pc, tmp1
-	vld1.8	{d0}, [src]!	/* 14 words to go.  */
-	vst1.8	{d0}, [dst]!
-	vld1.8	{d0}, [src]!	/* 12 words to go.  */
-	vst1.8	{d0}, [dst]!
-	vld1.8	{d0}, [src]!	/* 10 words to go.  */
-	vst1.8	{d0}, [dst]!
-	vld1.8	{d0}, [src]!	/* 8 words to go.  */
-	vst1.8	{d0}, [dst]!
-	vld1.8	{d0}, [src]!	/* 6 words to go.  */
-	vst1.8	{d0}, [dst]!
-	vld1.8	{d0}, [src]!	/* 4 words to go.  */
-	vst1.8	{d0}, [dst]!
-	vld1.8	{d0}, [src]!	/* 2 words to go.  */
-	vst1.8	{d0}, [dst]!
+	.macro dispatch_step i
+	sfi_breg src, neon_load_d0 \B
+	sfi_breg dst, neon_store_d0 \B
+	.endm
+	dispatch_7_dword

	tst	count, #4
-	ldrne	tmp1, [src], #4
-	strne	tmp1, [dst], #4
+	sfi_breg src, \
+	ldrne	tmp1, [\B], #4
+	sfi_breg dst, \
+	strne	tmp1, [\B], #4
 #else
	/* Copy up to 15 full words of data.  May not be aligned.  */
	/* Cannot use VFP for unaligned data.  */
	and	tmp1, count, #0x3c
	add	dst, dst, tmp1
	add	src, src, tmp1
-	rsb	tmp1, tmp1, #(60 - PC_OFS/2 + INSN_SIZE/2)
	/* Jump directly into the sequence below at the correct offset.  */
-	add	pc, pc, tmp1, lsl #1
-
-	ldr	tmp1, [src, #-60]	/* 15 words to go.  */
-	str	tmp1, [dst, #-60]
-
-	ldr	tmp1, [src, #-56]	/* 14 words to go.  */
-	str	tmp1, [dst, #-56]
-	ldr	tmp1, [src, #-52]
-	str	tmp1, [dst, #-52]
-
-	ldr	tmp1, [src, #-48]	/* 12 words to go.  */
-	str	tmp1, [dst, #-48]
-	ldr	tmp1, [src, #-44]
-	str	tmp1, [dst, #-44]
-
-	ldr	tmp1, [src, #-40]	/* 10 words to go.  */
-	str	tmp1, [dst, #-40]
-	ldr	tmp1, [src, #-36]
-	str	tmp1, [dst, #-36]
-
-	ldr	tmp1, [src, #-32]	/* 8 words to go.  */
-	str	tmp1, [dst, #-32]
-	ldr	tmp1, [src, #-28]
-	str	tmp1, [dst, #-28]
-
-	ldr	tmp1, [src, #-24]	/* 6 words to go.  */
-	str	tmp1, [dst, #-24]
-	ldr	tmp1, [src, #-20]
-	str	tmp1, [dst, #-20]
-
-	ldr	tmp1, [src, #-16]	/* 4 words to go.  */
-	str	tmp1, [dst, #-16]
-	ldr	tmp1, [src, #-12]
-	str	tmp1, [dst, #-12]
-
-	ldr	tmp1, [src, #-8]	/* 2 words to go.  */
-	str	tmp1, [dst, #-8]
-	ldr	tmp1, [src, #-4]
-	str	tmp1, [dst, #-4]
+	.macro dispatch_step i
+	sfi_breg src, \
+	ldr	tmp1, [\B, #-(\i * 4)]
+	sfi_breg dst, \
+	str	tmp1, [\B, #-(\i * 4)]
+	.endm
+	dispatch_15_word
 #endif

	lsls	count, count, #31
-	ldrhcs	tmp1, [src], #2
-	ldrbne	src, [src]	/* Src is dead, use as a scratch.  */
-	strhcs	tmp1, [dst], #2
-	strbne	src, [dst]
+	sfi_breg src, \
+	ldrhcs	tmp1, [\B], #2
+	sfi_breg src, \
+	ldrbne	src, [\B]	/* Src is dead, use as a scratch.  */
+	sfi_breg dst, \
+	strhcs	tmp1, [\B], #2
+	sfi_breg dst, \
+	strbne	src, [\B]
	bx	lr

 .Lcpy_not_short:
@@ -242,13 +387,19 @@ ENTRY(memcpy)
	beq	1f
	rsbs	tmp2, tmp2, #0
	sub	count, count, tmp2, lsr #29
-	ldrmi	tmp1, [src], #4
-	strmi	tmp1, [dst], #4
+	sfi_breg src, \
+	ldrmi	tmp1, [\B], #4
+	sfi_breg dst, \
+	strmi	tmp1, [\B], #4
	lsls	tmp2, tmp2, #2
-	ldrhcs	tmp1, [src], #2
-	ldrbne	tmp2, [src], #1
-	strhcs	tmp1, [dst], #2
-	strbne	tmp2, [dst], #1
+	sfi_breg src, \
+	ldrhcs	tmp1, [\B], #2
+	sfi_breg src, \
+	ldrbne	tmp2, [\B], #1
+	sfi_breg dst, \
+	strhcs	tmp1, [\B], #2
+	sfi_breg dst, \
+	strbne	tmp2, [\B], #1

 1:
	subs	tmp2, count, #64	/* Use tmp2 for count.  */
@@ -260,24 +411,40 @@ ENTRY(memcpy)
 .Lcpy_body_medium:			/* Count in tmp2.  */
 #ifdef USE_VFP
 1:
-	vldr	d0, [src, #0]
+	sfi_breg src, \
+	vldr	d0, [\B, #0]
	subs	tmp2, tmp2, #64
-	vldr	d1, [src, #8]
-	vstr	d0, [dst, #0]
-	vldr	d0, [src, #16]
-	vstr	d1, [dst, #8]
-	vldr	d1, [src, #24]
-	vstr	d0, [dst, #16]
-	vldr	d0, [src, #32]
-	vstr	d1, [dst, #24]
-	vldr	d1, [src, #40]
-	vstr	d0, [dst, #32]
-	vldr	d0, [src, #48]
-	vstr	d1, [dst, #40]
-	vldr	d1, [src, #56]
-	vstr	d0, [dst, #48]
+	sfi_breg src, \
+	vldr	d1, [\B, #8]
+	sfi_breg dst, \
+	vstr	d0, [\B, #0]
+	sfi_breg src, \
+	vldr	d0, [\B, #16]
+	sfi_breg dst, \
+	vstr	d1, [\B, #8]
+	sfi_breg src, \
+	vldr	d1, [\B, #24]
+	sfi_breg dst, \
+	vstr	d0, [\B, #16]
+	sfi_breg src, \
+	vldr	d0, [\B, #32]
+	sfi_breg dst, \
+	vstr	d1, [\B, #24]
+	sfi_breg src, \
+	vldr	d1, [\B, #40]
+	sfi_breg dst, \
+	vstr	d0, [\B, #32]
+	sfi_breg src, \
+	vldr	d0, [\B, #48]
+	sfi_breg dst, \
+	vstr	d1, [\B, #40]
+	sfi_breg src, \
+	vldr	d1, [\B, #56]
+	sfi_breg dst, \
+	vstr	d0, [\B, #48]
	add	src, src, #64
-	vstr	d1, [dst, #56]
+	sfi_breg dst, \
+	vstr	d1, [\B, #56]
	add	dst, dst, #64
	bge	1b
	tst	tmp2, #0x3f
@@ -287,43 +454,49 @@ ENTRY(memcpy)
	and	tmp1, tmp2, #0x38
	add	dst, dst, tmp1
	add	src, src, tmp1
-	rsb	tmp1, tmp1, #(56 - PC_OFS + INSN_SIZE)
-	add	pc, pc, tmp1
-
-	vldr	d0, [src, #-56]	/* 14 words to go.  */
-	vstr	d0, [dst, #-56]
-	vldr	d0, [src, #-48]	/* 12 words to go.  */
-	vstr	d0, [dst, #-48]
-	vldr	d0, [src, #-40]	/* 10 words to go.  */
-	vstr	d0, [dst, #-40]
-	vldr	d0, [src, #-32]	/* 8 words to go.  */
-	vstr	d0, [dst, #-32]
-	vldr	d0, [src, #-24]	/* 6 words to go.  */
-	vstr	d0, [dst, #-24]
-	vldr	d0, [src, #-16]	/* 4 words to go.  */
-	vstr	d0, [dst, #-16]
-	vldr	d0, [src, #-8]	/* 2 words to go.  */
-	vstr	d0, [dst, #-8]
+	.macro dispatch_step i
+	sfi_breg src, \
+	vldr	d0, [\B, #-(\i * 8)]
+	sfi_breg dst, \
+	vstr	d0, [\B, #-(\i * 8)]
+	.endm
+	dispatch_7_dword
 #else
	sub	src, src, #8
	sub	dst, dst, #8
 1:
-	ldrd	A_l, A_h, [src, #8]
-	strd	A_l, A_h, [dst, #8]
-	ldrd	A_l, A_h, [src, #16]
-	strd	A_l, A_h, [dst, #16]
-	ldrd	A_l, A_h, [src, #24]
-	strd	A_l, A_h, [dst, #24]
-	ldrd	A_l, A_h, [src, #32]
-	strd	A_l, A_h, [dst, #32]
-	ldrd	A_l, A_h, [src, #40]
-	strd	A_l, A_h, [dst, #40]
-	ldrd	A_l, A_h, [src, #48]
-	strd	A_l, A_h, [dst, #48]
-	ldrd	A_l, A_h, [src, #56]
-	strd	A_l, A_h, [dst, #56]
-	ldrd	A_l, A_h, [src, #64]!
-	strd	A_l, A_h, [dst, #64]!
+	sfi_breg src, \
+	ldrd	A_l, A_h, [\B, #8]
+	sfi_breg dst, \
+	strd	A_l, A_h, [\B, #8]
+	sfi_breg src, \
+	ldrd	A_l, A_h, [\B, #16]
+	sfi_breg dst, \
+	strd	A_l, A_h, [\B, #16]
+	sfi_breg src, \
+	ldrd	A_l, A_h, [\B, #24]
+	sfi_breg dst, \
+	strd	A_l, A_h, [\B, #24]
+	sfi_breg src, \
+	ldrd	A_l, A_h, [\B, #32]
+	sfi_breg dst, \
+	strd	A_l, A_h, [\B, #32]
+	sfi_breg src, \
+	ldrd	A_l, A_h, [\B, #40]
+	sfi_breg dst, \
+	strd	A_l, A_h, [\B, #40]
+	sfi_breg src, \
+	ldrd	A_l, A_h, [\B, #48]
+	sfi_breg dst, \
+	strd	A_l, A_h, [\B, #48]
+	sfi_breg src, \
+	ldrd	A_l, A_h, [\B, #56]
+	sfi_breg dst, \
+	strd	A_l, A_h, [\B, #56]
+	sfi_breg src, \
+	ldrd	A_l, A_h, [\B, #64]!
+	sfi_breg dst, \
+	strd	A_l, A_h, [\B, #64]!
	subs	tmp2, tmp2, #64
	bge	1b
	tst	tmp2, #0x3f
@@ -349,32 +522,29 @@ ENTRY(memcpy)
	and	tmp1, tmp2, #0x38
	add	dst, dst, tmp1
	add	src, src, tmp1
-	rsb	tmp1, tmp1, #(56 - PC_OFS + INSN_SIZE)
-	add	pc, pc, tmp1
-	ldrd	A_l, A_h, [src, #-56]	/* 14 words to go.  */
-	strd	A_l, A_h, [dst, #-56]
-	ldrd	A_l, A_h, [src, #-48]	/* 12 words to go.  */
-	strd	A_l, A_h, [dst, #-48]
-	ldrd	A_l, A_h, [src, #-40]	/* 10 words to go.  */
-	strd	A_l, A_h, [dst, #-40]
-	ldrd	A_l, A_h, [src, #-32]	/* 8 words to go.  */
-	strd	A_l, A_h, [dst, #-32]
-	ldrd	A_l, A_h, [src, #-24]	/* 6 words to go.  */
-	strd	A_l, A_h, [dst, #-24]
-	ldrd	A_l, A_h, [src, #-16]	/* 4 words to go.  */
-	strd	A_l, A_h, [dst, #-16]
-	ldrd	A_l, A_h, [src, #-8]	/* 2 words to go.  */
-	strd	A_l, A_h, [dst, #-8]
-
+	.macro dispatch_step i
+	sfi_breg src, \
+	ldrd	A_l, A_h, [\B, #-(\i * 8)]
+	sfi_breg dst, \
+	strd	A_l, A_h, [\B, #-(\i * 8)]
+	.endm
+	dispatch_7_dword
 #endif
+
	tst	tmp2, #4
-	ldrne	tmp1, [src], #4
-	strne	tmp1, [dst], #4
+	sfi_breg src, \
+	ldrne	tmp1, [\B], #4
+	sfi_breg dst, \
+	strne	tmp1, [\B], #4
	lsls	tmp2, tmp2, #31		/* Count (tmp2) now dead.  */
-	ldrhcs	tmp1, [src], #2
-	ldrbne	tmp2, [src]
-	strhcs	tmp1, [dst], #2
-	strbne	tmp2, [dst]
+	sfi_breg src, \
+	ldrhcs	tmp1, [\B], #2
+	sfi_breg src, \
+	ldrbne	tmp2, [\B]
+	sfi_breg dst, \
+	strhcs	tmp1, [\B], #2
+	sfi_breg dst, \
+	strbne	tmp2, [\B]

 .Ldone:
	ldr	tmp2, [sp], #FRAME_SIZE
@@ -394,15 +564,23 @@ ENTRY(memcpy)
	   copy position into a register.  This should act like a PLD
	   operation but we won't have to repeat the transfer.  */
-	vldr	d3, [src, #0]
-	vldr	d4, [src, #64]
-	vldr	d5, [src, #128]
-	vldr	d6, [src, #192]
-	vldr	d7, [src, #256]
-
-	vldr	d0, [src, #8]
-	vldr	d1, [src, #16]
-	vldr	d2, [src, #24]
+	sfi_breg src, \
+	vldr	d3, [\B, #0]
+	sfi_breg src, \
+	vldr	d4, [\B, #64]
+	sfi_breg src, \
+	vldr	d5, [\B, #128]
+	sfi_breg src, \
+	vldr	d6, [\B, #192]
+	sfi_breg src, \
+	vldr	d7, [\B, #256]
+
+	sfi_breg src, \
+	vldr	d0, [\B, #8]
+	sfi_breg src, \
+	vldr	d1, [\B, #16]
+	sfi_breg src, \
+	vldr	d2, [\B, #24]
	add	src, src, #32

	subs	tmp2, tmp2, #prefetch_lines * 64 * 2
@@ -427,19 +605,31 @@ ENTRY(memcpy)
	add	src, src, #3 * 64
	add	dst, dst, #3 * 64
	cpy_tail_vfp	d6, 0
-	vstr	d7, [dst, #64]
-	vldr	d7, [src, #64]
-	vstr	d0, [dst, #64 + 8]
-	vldr	d0, [src, #64 + 8]
-	vstr	d1, [dst, #64 + 16]
-	vldr	d1, [src, #64 + 16]
-	vstr	d2, [dst, #64 + 24]
-	vldr	d2, [src, #64 + 24]
-	vstr	d7, [dst, #64 + 32]
+	sfi_breg dst, \
+	vstr	d7, [\B, #64]
+	sfi_breg src, \
+	vldr	d7, [\B, #64]
+	sfi_breg dst, \
+	vstr	d0, [\B, #64 + 8]
+	sfi_breg src, \
+	vldr	d0, [\B, #64 + 8]
+	sfi_breg dst, \
+	vstr	d1, [\B, #64 + 16]
+	sfi_breg src, \
+	vldr	d1, [\B, #64 + 16]
+	sfi_breg dst, \
+	vstr	d2, [\B, #64 + 24]
+	sfi_breg src, \
+	vldr	d2, [\B, #64 + 24]
+	sfi_breg dst, \
+	vstr	d7, [\B, #64 + 32]
	add	src, src, #96
-	vstr	d0, [dst, #64 + 40]
-	vstr	d1, [dst, #64 + 48]
-	vstr	d2, [dst, #64 + 56]
+	sfi_breg dst, \
+	vstr	d0, [\B, #64 + 40]
+	sfi_breg dst, \
+	vstr	d1, [\B, #64 + 48]
+	sfi_breg dst, \
+	vstr	d2, [\B, #64 + 56]
	add	dst, dst, #128
	add	tmp2, tmp2, #prefetch_lines * 64
	b	.Lcpy_body_medium
@@ -450,59 +640,83 @@ ENTRY(memcpy)
	/* Pre-bias src and dst.  */
	sub	src, src, #8
	sub	dst, dst, #8
-	pld	[src, #8]
-	pld	[src, #72]
+	sfi_pld	src, #8
+	sfi_pld	src, #72
	subs	tmp2, tmp2, #64
-	pld	[src, #136]
-	ldrd	A_l, A_h, [src, #8]
+	sfi_pld	src, #136
+	sfi_breg src, \
+	ldrd	A_l, A_h, [\B, #8]
	strd	B_l, B_h, [sp, #8]
	cfi_rel_offset (B_l, 8)
	cfi_rel_offset (B_h, 12)
-	ldrd	B_l, B_h, [src, #16]
+	sfi_breg src, \
+	ldrd	B_l, B_h, [\B, #16]
	strd	C_l, C_h, [sp, #16]
	cfi_rel_offset (C_l, 16)
	cfi_rel_offset (C_h, 20)
-	ldrd	C_l, C_h, [src, #24]
+	sfi_breg src, \
+	ldrd	C_l, C_h, [\B, #24]
	strd	D_l, D_h, [sp, #24]
	cfi_rel_offset (D_l, 24)
	cfi_rel_offset (D_h, 28)
-	pld	[src, #200]
-	ldrd	D_l, D_h, [src, #32]!
+	sfi_pld	src, #200
+	sfi_breg src, \
+	ldrd	D_l, D_h, [\B, #32]!
	b	1f
	.p2align	6
 2:
-	pld	[src, #232]
-	strd	A_l, A_h, [dst, #40]
-	ldrd	A_l, A_h, [src, #40]
-	strd	B_l, B_h, [dst, #48]
-	ldrd	B_l, B_h, [src, #48]
-	strd	C_l, C_h, [dst, #56]
-	ldrd	C_l, C_h, [src, #56]
-	strd	D_l, D_h, [dst, #64]!
-	ldrd	D_l, D_h, [src, #64]!
+	sfi_pld	src, #232
+	sfi_breg dst, \
+	strd	A_l, A_h, [\B, #40]
+	sfi_breg src, \
+	ldrd	A_l, A_h, [\B, #40]
+	sfi_breg dst, \
+	strd	B_l, B_h, [\B, #48]
+	sfi_breg src, \
+	ldrd	B_l, B_h, [\B, #48]
+	sfi_breg dst, \
+	strd	C_l, C_h, [\B, #56]
+	sfi_breg src, \
+	ldrd	C_l, C_h, [\B, #56]
+	sfi_breg dst, \
+	strd	D_l, D_h, [\B, #64]!
+	sfi_breg src, \
+	ldrd	D_l, D_h, [\B, #64]!
	subs	tmp2, tmp2, #64
 1:
-	strd	A_l, A_h, [dst, #8]
-	ldrd	A_l, A_h, [src, #8]
-	strd	B_l, B_h, [dst, #16]
-	ldrd	B_l, B_h, [src, #16]
-	strd	C_l, C_h, [dst, #24]
-	ldrd	C_l, C_h, [src, #24]
-	strd	D_l, D_h, [dst, #32]
-	ldrd	D_l, D_h, [src, #32]
+	sfi_breg dst, \
+	strd	A_l, A_h, [\B, #8]
+	sfi_breg src, \
+	ldrd	A_l, A_h, [\B, #8]
+	sfi_breg dst, \
+	strd	B_l, B_h, [\B, #16]
+	sfi_breg src, \
+	ldrd	B_l, B_h, [\B, #16]
+	sfi_breg dst, \
+	strd	C_l, C_h, [\B, #24]
+	sfi_breg src, \
+	ldrd	C_l, C_h, [\B, #24]
+	sfi_breg dst, \
+	strd	D_l, D_h, [\B, #32]
+	sfi_breg src, \
+	ldrd	D_l, D_h, [\B, #32]
	bcs	2b

	/* Save the remaining bytes and restore the callee-saved regs.  */
-	strd	A_l, A_h, [dst, #40]
+	sfi_breg dst, \
+	strd	A_l, A_h, [\B, #40]
	add	src, src, #40
-	strd	B_l, B_h, [dst, #48]
+	sfi_breg dst, \
+	strd	B_l, B_h, [\B, #48]
	ldrd	B_l, B_h, [sp, #8]
	cfi_restore (B_l)
	cfi_restore (B_h)
-	strd	C_l, C_h, [dst, #56]
+	sfi_breg dst, \
+	strd	C_l, C_h, [\B, #56]
	ldrd	C_l, C_h, [sp, #16]
	cfi_restore (C_l)
	cfi_restore (C_h)
-	strd	D_l, D_h, [dst, #64]
+	sfi_breg dst, \
+	strd	D_l, D_h, [\B, #64]
	ldrd	D_l, D_h, [sp, #24]
	cfi_restore (D_l)
	cfi_restore (D_h)
@@ -519,113 +733,173 @@ ENTRY(memcpy)
	cfi_remember_state

 .Lcpy_notaligned:
-	pld	[src]
-	pld	[src, #64]
+	sfi_pld	src
+	sfi_pld	src, #64
	/* There's at least 64 bytes to copy, but there is no mutual
	   alignment.  */
	/* Bring DST to 64-bit alignment.  */
	lsls	tmp2, dst, #29
-	pld	[src, #(2 * 64)]
+	sfi_pld	src, #(2 * 64)
	beq	1f
	rsbs	tmp2, tmp2, #0
	sub	count, count, tmp2, lsr #29
-	ldrmi	tmp1, [src], #4
-	strmi	tmp1, [dst], #4
+	sfi_breg src, \
+	ldrmi	tmp1, [\B], #4
+	sfi_breg dst, \
+	strmi	tmp1, [\B], #4
	lsls	tmp2, tmp2, #2
-	ldrbne	tmp1, [src], #1
-	ldrhcs	tmp2, [src], #2
-	strbne	tmp1, [dst], #1
-	strhcs	tmp2, [dst], #2
+	sfi_breg src, \
+	ldrbne	tmp1, [\B], #1
+	sfi_breg src, \
+	ldrhcs	tmp2, [\B], #2
+	sfi_breg dst, \
+	strbne	tmp1, [\B], #1
+	sfi_breg dst, \
+	strhcs	tmp2, [\B], #2
 1:
-	pld	[src, #(3 * 64)]
+	sfi_pld	src, #(3 * 64)
	subs	count, count, #64
	ldrmi	tmp2, [sp], #FRAME_SIZE
	bmi	.Ltail63unaligned
-	pld	[src, #(4 * 64)]
+	sfi_pld	src, #(4 * 64)

 #ifdef USE_NEON
-	vld1.8	{d0-d3}, [src]!
-	vld1.8	{d4-d7}, [src]!
+	/* These need an extra layer of macro just to work around a
+	   bug in the assembler's parser when an operand starts with
+	   a {...}.  */
+	.macro neon_load_multi reglist, basereg
+	vld1.8	{\reglist}, [\basereg]!
+	.endm
+	.macro neon_store_multi reglist, basereg
+	vst1.8	{\reglist}, [ALIGN (\basereg, 64)]!
+	.endm
+
+	/* These are used by the NaCl sfi_breg macro.  */
+	.macro _sfi_breg_dmask_neon_load_multi reg
+	_sfi_dmask \reg
+	.endm
+	.macro _sfi_breg_dmask_neon_store_multi reg
+	_sfi_dmask \reg
+	.endm
+
+	sfi_breg src, neon_load_multi d0-d3, \B
+	sfi_breg src, neon_load_multi d4-d7, \B
	subs	count, count, #64
	bmi	2f
 1:
-	pld	[src, #(4 * 64)]
-	vst1.8	{d0-d3}, [ALIGN (dst, 64)]!
-	vld1.8	{d0-d3}, [src]!
-	vst1.8	{d4-d7}, [ALIGN (dst, 64)]!
-	vld1.8	{d4-d7}, [src]!
+	sfi_pld	src, #(4 * 64)
+	sfi_breg dst, neon_store_multi d0-d3, \B
+	sfi_breg src, neon_load_multi d0-d3, \B
+	sfi_breg dst, neon_store_multi d4-d7, \B
+	sfi_breg src, neon_load_multi d4-d7, \B
	subs	count, count, #64
	bpl	1b
 2:
-	vst1.8	{d0-d3}, [ALIGN (dst, 64)]!
-	vst1.8	{d4-d7}, [ALIGN (dst, 64)]!
+	sfi_breg dst, neon_store_multi d0-d3, \B
+	sfi_breg dst, neon_store_multi d4-d7, \B
	ands	count, count, #0x3f
 #else
	/* Use an SMS style loop to maximize the I/O bandwidth.  */
	sub	src, src, #4
	sub	dst, dst, #8
	subs	tmp2, count, #64	/* Use tmp2 for count.  */
-	ldr	A_l, [src, #4]
-	ldr	A_h, [src, #8]
+	sfi_breg src, \
+	ldr	A_l, [\B, #4]
+	sfi_breg src, \
+	ldr	A_h, [\B, #8]
	strd	B_l, B_h, [sp, #8]
	cfi_rel_offset (B_l, 8)
	cfi_rel_offset (B_h, 12)
-	ldr	B_l, [src, #12]
-	ldr	B_h, [src, #16]
+	sfi_breg src, \
+	ldr	B_l, [\B, #12]
+	sfi_breg src, \
+	ldr	B_h, [\B, #16]
	strd	C_l, C_h, [sp, #16]
	cfi_rel_offset (C_l, 16)
	cfi_rel_offset (C_h, 20)
-	ldr	C_l, [src, #20]
-	ldr	C_h, [src, #24]
+	sfi_breg src, \
+	ldr	C_l, [\B, #20]
+	sfi_breg src, \
+	ldr	C_h, [\B, #24]
	strd	D_l, D_h, [sp, #24]
	cfi_rel_offset (D_l, 24)
	cfi_rel_offset (D_h, 28)
-	ldr	D_l, [src, #28]
-	ldr	D_h, [src, #32]!
+	sfi_breg src, \
+	ldr	D_l, [\B, #28]
+	sfi_breg src, \
+	ldr	D_h, [\B, #32]!
	b	1f
	.p2align	6
 2:
-	pld	[src, #(5 * 64) - (32 - 4)]
-	strd	A_l, A_h, [dst, #40]
-	ldr	A_l, [src, #36]
-	ldr	A_h, [src, #40]
-	strd	B_l, B_h, [dst, #48]
-	ldr	B_l, [src, #44]
-	ldr	B_h, [src, #48]
-	strd	C_l, C_h, [dst, #56]
-	ldr	C_l, [src, #52]
-	ldr	C_h, [src, #56]
-	strd	D_l, D_h, [dst, #64]!
-	ldr	D_l, [src, #60]
-	ldr	D_h, [src, #64]!
+	sfi_pld	src, #(5 * 64) - (32 - 4)
+	sfi_breg dst, \
+	strd	A_l, A_h, [\B, #40]
+	sfi_breg src, \
+	ldr	A_l, [\B, #36]
+	sfi_breg src, \
+	ldr	A_h, [\B, #40]
+	sfi_breg dst, \
+	strd	B_l, B_h, [\B, #48]
+	sfi_breg src, \
+	ldr	B_l, [\B, #44]
+	sfi_breg src, \
+	ldr	B_h, [\B, #48]
+	sfi_breg dst, \
+	strd	C_l, C_h, [\B, #56]
+	sfi_breg src, \
+	ldr	C_l, [\B, #52]
+	sfi_breg src, \
+	ldr	C_h, [\B, #56]
+	sfi_breg dst, \
+	strd	D_l, D_h, [\B, #64]!
+	sfi_breg src, \
+	ldr	D_l, [\B, #60]
+	sfi_breg src, \
+	ldr	D_h, [\B, #64]!
	subs	tmp2, tmp2, #64
 1:
-	strd	A_l, A_h, [dst, #8]
-	ldr	A_l, [src, #4]
-	ldr	A_h, [src, #8]
-	strd	B_l, B_h, [dst, #16]
-	ldr	B_l, [src, #12]
-	ldr	B_h, [src, #16]
-	strd	C_l, C_h, [dst, #24]
-	ldr	C_l, [src, #20]
-	ldr	C_h, [src, #24]
-	strd	D_l, D_h, [dst, #32]
-	ldr	D_l, [src, #28]
-	ldr	D_h, [src, #32]
+	sfi_breg dst, \
+	strd	A_l, A_h, [\B, #8]
+	sfi_breg src, \
+	ldr	A_l, [\B, #4]
+	sfi_breg src, \
+	ldr	A_h, [\B, #8]
+	sfi_breg dst, \
+	strd	B_l, B_h, [\B, #16]
+	sfi_breg src, \
+	ldr	B_l, [\B, #12]
+	sfi_breg src, \
+	ldr	B_h, [\B, #16]
+	sfi_breg dst, \
+	strd	C_l, C_h, [\B, #24]
+	sfi_breg src, \
+	ldr	C_l, [\B, #20]
+	sfi_breg src, \
+	ldr	C_h, [\B, #24]
+	sfi_breg dst, \
+	strd	D_l, D_h, [\B, #32]
+	sfi_breg src, \
+	ldr	D_l, [\B, #28]
+	sfi_breg src, \
+	ldr	D_h, [\B, #32]
	bcs	2b

	/* Save the remaining bytes and restore the callee-saved regs.  */
-	strd	A_l, A_h, [dst, #40]
+	sfi_breg dst, \
+	strd	A_l, A_h, [\B, #40]
	add	src, src, #36
-	strd	B_l, B_h, [dst, #48]
+	sfi_breg dst, \
+	strd	B_l, B_h, [\B, #48]
	ldrd	B_l, B_h, [sp, #8]
	cfi_restore (B_l)
	cfi_restore (B_h)
-	strd	C_l, C_h, [dst, #56]
+	sfi_breg dst, \
+	strd	C_l, C_h, [\B, #56]
	ldrd	C_l, C_h, [sp, #16]
	cfi_restore (C_l)
	cfi_restore (C_h)
-	strd	D_l, D_h, [dst, #64]
+	sfi_breg dst, \
+	strd	D_l, D_h, [\B, #64]
	ldrd	D_l, D_h, [sp, #24]
	cfi_restore (D_l)
	cfi_restore (D_h)
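
An illustrative sketch of the dispatch idiom the patch macroizes (a toy,
not part of the patch: the copy_tail label and the register roles
r0=dst, r1=src, r2=count are invented for the example, and PC_OFS (8 in
ARM state) and INSN_SIZE (4) are written as literals where the real code
uses the sysdep.h macros).  Shrunk from fifteen word-sized steps to
three, dispatch_15_word's computed jump works like this:

	.syntax unified
	.arm
copy_tail:
	/* r2 is a multiple of 4, 0 <= r2 <= 12.  Pre-advance the
	   pointers so each step can use a fixed negative offset.  */
	add	r0, r0, r2
	add	r1, r1, r2
	/* r2 becomes 10 - count, i.e. 4*steps_to_skip - 2; doubled by
	   the lsl it is 8*steps_to_skip - 4.  The pc read in the add
	   is the add's own address + 8, so the branch lands exactly
	   8*steps_to_skip code bytes past the first ldr below.  */
	rsb	r2, r2, #((3 * 4) - 8/2 + 4/2)
	add	pc, pc, r2, lsl #1
	ldr	r3, [r1, #-12]	/* 3 words to go.  */
	str	r3, [r0, #-12]
	ldr	r3, [r1, #-8]	/* 2 words to go.  */
	str	r3, [r0, #-8]
	ldr	r3, [r1, #-4]	/* 1 word to go.  */
	str	r3, [r0, #-4]
	bx	lr

Each step must assemble to exactly two 4-byte instructions for the
arithmetic to hold, which is why dispatch_step does one load and one
store and nothing else.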
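Under ARM_ALWAYS_BX the add-to-pc form is not allowed and every indirect
branch target must sit on a 1 << ARM_BX_ALIGN_LOG2 boundary, so the same
toy (again my sketch, assuming ARM_BX_ALIGN_LOG2 = 4 and a
single-instruction bx, i.e. ARM_BX_NINSNS = 1) pads each step out to a
16-byte bundle and enters it with bx, mirroring what dispatch_helper
arranges in general:

	.p2align 4
	/* rsb, nop, add, bx fill exactly one 16-byte bundle, so the pc
	   read in the add (its own address + 8) is the address of the
	   first aligned step below.  */
	rsb	r2, r2, #(3 * 4)	/* r2 = 4 * steps_to_skip.  */
	nop
	add	r2, pc, r2, lsl #(4 - 2)	/* Whole bundles to skip.  */
	bx	r2
	.p2align 4
	ldr	r3, [r1, #-12]	/* 3 words to go.  */
	str	r3, [r0, #-12]
	.p2align 4
	ldr	r3, [r1, #-8]	/* 2 words to go.  */
	str	r3, [r0, #-8]
	.p2align 4
	ldr	r3, [r1, #-4]	/* 1 word to go.  */
	str	r3, [r0, #-4]
	.p2align 4
	bx	lr

The single nop is the .rept count from dispatch_helper: a 16-byte bundle
holds four instructions, of which one is the rsb and two are the add;bx
pair, with ARM_BX_NINSNS letting an OS whose bx macro expands to several
instructions shrink the padding to compensate.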