[PATCH] ARM: NEON detected memcpy. - Shih-Yuan Lee (FourDollars)

public inbox for libc-ports@sourceware.org
 help / color / mirror / Atom feed

From: "Shih-Yuan Lee (FourDollars)" <sylee@canonical.com>
To: patches@eglibc.org, libc-ports@sourceware.org
Cc: rex.tsai@canonical.com, jesse.sung@canonical.com,
	yc.cheng@canonical.com, 	Shih-Yuan Lee <fourdollars@gmail.com>
Subject: [PATCH] ARM: NEON detected memcpy.
Date: Wed, 03 Apr 2013 07:58:00 -0000	[thread overview]
Message-ID: <CAAT15mNnqeb6tuVdV6b4uJf-qFDH1acxevyW6f-gH+SkguENmg@mail.gmail.com> (raw)

[-- Attachment #1: Type: text/plain, Size: 560 bytes --]

Hi,

I am working on a memcpy that detects NEON at run time.
This is based on what Siarhei Siamashka did in 2009 [1].

The idea is to read HWCAP and check the NEON bit.
If the NEON bit is set, the NEON-optimized memcpy is used.
If not, the original memcpy is used instead.

With the NEON-optimized memcpy, the performance of memcpy
improves by about 50% [2].

What do you think about this idea? Any comments are welcome.

[1]: http://sourceware.org/ml/libc-ports/2009-07/msg00003.html
[2]: http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.faqs/ka13544.html

Regards,
$4

[-- Attachment #2: 0001-ARM-NEON-optimized-implementation-of-memcpy.patch --]
[-- Type: application/octet-stream, Size: 4198 bytes --]

From 8d746bb4e05cab5a5430e59653ddac2d6cb62e32 Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
Date: Sun, 5 Jul 2009 18:21:03 +0300
Subject: [PATCH 1/2] ARM: NEON optimized implementation of memcpy.

---
 ports/sysdeps/arm/memcpy.S |  132 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 132 insertions(+)

diff --git a/ports/sysdeps/arm/memcpy.S b/ports/sysdeps/arm/memcpy.S
index add82e2..c1b1357 100644
--- a/ports/sysdeps/arm/memcpy.S
+++ b/ports/sysdeps/arm/memcpy.S
@@ -2,6 +2,7 @@
    This file is part of the GNU C Library.
 
    Contributed by MontaVista Software, Inc. (written by Nicolas Pitre)
+   NEON code contributed by Nokia Corporation (written by Siarhei Siamashka)
 
    The GNU C Library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
@@ -22,6 +23,135 @@
 #include <sysdep.h>
 #include <arm-features.h>
 
+#ifdef __ARM_NEON__
+		.text
+		.fpu    neon
+
+/*
+ * ENABLE_UNALIGNED_MEM_ACCESSES macro can be defined to permit the use
+ * of unaligned load/store memory accesses supported since ARMv6. This
+ * will further improve performance, but can purely theoretically cause
+ * problems if somebody decides to set SCTLR.A bit in the OS kernel
+ * (to trap each unaligned memory access) or somehow mess with strongly
+ * ordered/device memory.
+ */
+
+#define NEON_MAX_PREFETCH_DISTANCE 320
+
+ENTRY(memcpy)
+		mov	ip, r0
+		cmp	r2, #16
+		blt	4f	@ Have less than 16 bytes to copy
+
+		@ First ensure 16 byte alignment for the destination buffer
+		tst	r0, #0xF
+		beq	2f
+		tst	r0, #1
+		ldrneb	r3, [r1], #1
+		strneb	r3, [ip], #1
+		subne	r2, r2, #1
+		tst	ip, #2
+#ifdef ENABLE_UNALIGNED_MEM_ACCESSES
+		ldrneh	r3, [r1], #2
+		strneh	r3, [ip], #2
+#else
+		ldrneb	r3, [r1], #1
+		strneb	r3, [ip], #1
+		ldrneb	r3, [r1], #1
+		strneb	r3, [ip], #1
+#endif
+		subne	r2, r2, #2
+
+		tst	ip, #4
+		beq	1f
+		vld4.8	{d0[0], d1[0], d2[0], d3[0]}, [r1]!
+		vst4.8	{d0[0], d1[0], d2[0], d3[0]}, [ip, :32]!
+		sub	r2, r2, #4
+1:
+		tst	ip, #8
+		beq	2f
+		vld1.8	{d0}, [r1]!
+		vst1.8	{d0}, [ip, :64]!
+		sub	r2, r2, #8
+2:
+		subs	r2, r2, #32
+		blt	3f
+		mov	r3, #32
+
+		@ Main copy loop, 32 bytes are processed per iteration.
+		@ ARM instructions are used for doing fine-grained prefetch,
+		@ increasing prefetch distance progressively up to
+		@ NEON_MAX_PREFETCH_DISTANCE at runtime
+1:
+		vld1.8	{d0-d3}, [r1]!
+		cmp	r3, #(NEON_MAX_PREFETCH_DISTANCE - 32)
+		pld	[r1, r3]
+		addle	r3, r3, #32
+		vst1.8	{d0-d3}, [ip, :128]!
+		sub	r2, r2, #32
+		cmp	r2, r3
+		bge	1b
+		cmp	r2, #0
+		blt	3f
+1:		@ Copy the remaining part of the buffer (already prefetched)
+		vld1.8	{d0-d3}, [r1]!
+		subs	r2, r2, #32
+		vst1.8	{d0-d3}, [ip, :128]!
+		bge	1b
+3:		@ Copy up to 31 remaining bytes
+		tst	r2, #16
+		beq	4f
+		vld1.8	{d0, d1}, [r1]!
+		vst1.8	{d0, d1}, [ip, :128]!
+4:
+		@ Use ARM instructions exclusively for the final trailing part
+		@ not fully fitting into full 16 byte aligned block in order
+		@ to avoid "ARM store after NEON store" hazard. Also NEON
+		@ pipeline will be (mostly) flushed by the time when the
+		@ control returns to the caller, making the use of NEON mostly
+		@ transparent (and avoiding hazards in the caller code)
+
+#ifdef ENABLE_UNALIGNED_MEM_ACCESSES
+		movs	r3, r2, lsl #29
+		ldrcs	r3, [r1], #4
+		strcs	r3, [ip], #4
+		ldrcs	r3, [r1], #4
+		strcs	r3, [ip], #4
+		ldrmi	r3, [r1], #4
+		strmi	r3, [ip], #4
+		movs	r2, r2, lsl #31
+		ldrcsh	r3, [r1], #2
+		strcsh	r3, [ip], #2
+		ldrmib	r3, [r1], #1
+		strmib	r3, [ip], #1
+#else
+		movs	r3, r2, lsl #29
+		bcc	1f
+	.rept	8
+		ldrcsb	r3, [r1], #1
+		strcsb	r3, [ip], #1
+	.endr
+1:
+		bpl	1f
+	.rept	4
+		ldrmib	r3, [r1], #1
+		strmib	r3, [ip], #1
+	.endr
+1:
+		movs	r2, r2, lsl #31
+		ldrcsb	r3, [r1], #1
+		strcsb	r3, [ip], #1
+		ldrcsb	r3, [r1], #1
+		strcsb	r3, [ip], #1
+		ldrmib	r3, [r1], #1
+		strmib	r3, [ip], #1
+#endif
+		bx	lr
+END(memcpy)
+libc_hidden_builtin_def (memcpy)
+
+#else
+
 /*
  * Data preload for architectures that support it (ARM V5TE and above)
  */
@@ -342,3 +472,5 @@ ENTRY(memcpy)
 
 END(memcpy)
 libc_hidden_builtin_def (memcpy)
+
+#endif
-- 
1.7.10.4


[-- Attachment #3: 0002-ARM-NEON-detected-memcpy.patch --]
[-- Type: application/octet-stream, Size: 4577 bytes --]

From 64299d7bd853314bc3bc96853220461533f26069 Mon Sep 17 00:00:00 2001
From: "Shih-Yuan Lee (FourDollars)" <sylee@canonical.com>
Date: Wed, 3 Apr 2013 14:07:37 +0800
Subject: [PATCH 2/2] ARM: NEON detected memcpy.

---
 ports/sysdeps/arm/memcpy.S |  120 +++++++++++++++++++++++++++++---------------
 1 file changed, 79 insertions(+), 41 deletions(-)

diff --git a/ports/sysdeps/arm/memcpy.S b/ports/sysdeps/arm/memcpy.S
index c1b1357..94fe8e2 100644
--- a/ports/sysdeps/arm/memcpy.S
+++ b/ports/sysdeps/arm/memcpy.S
@@ -3,6 +3,8 @@
 
    Contributed by MontaVista Software, Inc. (written by Nicolas Pitre)
    NEON code contributed by Nokia Corporation (written by Siarhei Siamashka)
+   NEON detection contributed by Canonical Ltd. (written by Shih-Yuan Lee
+   aka FourDollars)
 
    The GNU C Library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
@@ -23,9 +25,36 @@
 #include <sysdep.h>
 #include <arm-features.h>
 
-#ifdef __ARM_NEON__
-		.text
-		.fpu    neon
+/*
+ * Data preload for architectures that support it (ARM V5TE and above)
+ */
+#if (!defined (__ARM_ARCH_2__) && !defined (__ARM_ARCH_3__) \
+     && !defined (__ARM_ARCH_3M__) && !defined (__ARM_ARCH_4__) \
+     && !defined (__ARM_ARCH_4T__) && !defined (__ARM_ARCH_5__) \
+     && !defined (__ARM_ARCH_5T__))
+#define PLD(code...)    code
+#else
+#define PLD(code...)
+#endif
+
+/*
+ * This can be used to enable code to cacheline align the source pointer.
+ * Experiments on tested architectures (StrongARM and XScale) didn't show
+ * this a worthwhile thing to do.  That might be different in the future.
+ */
+//#define CALGN(code...)        code
+#define CALGN(code...)
+
+/*
+ * Endian independent macros for shifting bytes within registers.
+ */
+#ifndef __ARMEB__
+#define PULL            lsr
+#define PUSH            lsl
+#else
+#define PULL            lsl
+#define PUSH            lsr
+#endif
 
 /*
  * ENABLE_UNALIGNED_MEM_ACCESSES macro can be defined to permit the use
@@ -36,9 +65,44 @@
  * ordered/device memory.
  */
 
+#ifdef __ARM_FEATURE_UNALIGNED
+#define ENABLE_UNALIGNED_MEM_ACCESSES 1
+#endif
+
 #define NEON_MAX_PREFETCH_DISTANCE 320
 
+		.text
+		.fpu    neon
+
+/* Prototype: void *memcpy(void *dest, const void *src, size_t n); */
+
 ENTRY(memcpy)
+		stmfd	sp!, {r0, r1}
+
+		@ Check if there is a NEON extension.
+#ifdef IS_IN_rtld
+		ldr	a1, 5f
+		ldr	a2, Lrtld_local_ro
+0:		add	a1, pc, a1
+		add	a1, a1, a2
+		ldr	a1, [a1, #RTLD_GLOBAL_RO_DL_HWCAP_OFFSET]
+#else
+#ifdef PIC
+		ldr	a1, 5f
+		ldr	a2, Lrtld_global_ro
+0:		add	a1, pc, a1
+		ldr	a1, [a1, a2]
+		ldr	a1, [a1, #RTLD_GLOBAL_RO_DL_HWCAP_OFFSET]
+#else
+		ldr	a1, Lhwcap
+		ldr	a1, [a1, #0]
+#endif
+#endif
+		tst	a1, #HWCAP_ARM_NEON
+		ldmfd	sp!, {r0, r1}
+		beq	Lno_neon
+
+		@ Optimized memcpy by NEON extension.
 		mov	ip, r0
 		cmp	r2, #16
 		blt	4f	@ Have less than 16 bytes to copy
@@ -147,49 +211,24 @@ ENTRY(memcpy)
 		strmib	r3, [ip], #1
 #endif
 		bx	lr
-END(memcpy)
-libc_hidden_builtin_def (memcpy)
 
+#ifdef IS_IN_rtld
+5:	.long	_GLOBAL_OFFSET_TABLE_ - 0b - 8
+Lrtld_local_ro:
+	.long	C_SYMBOL_NAME(_rtld_local_ro)(GOTOFF)
 #else
-
-/*
- * Data preload for architectures that support it (ARM V5TE and above)
- */
-#if (!defined (__ARM_ARCH_2__) && !defined (__ARM_ARCH_3__) \
-     && !defined (__ARM_ARCH_3M__) && !defined (__ARM_ARCH_4__) \
-     && !defined (__ARM_ARCH_4T__) && !defined (__ARM_ARCH_5__) \
-     && !defined (__ARM_ARCH_5T__))
-#define PLD(code...)    code
+#ifdef PIC
+5:	.long	_GLOBAL_OFFSET_TABLE_ - 0b - 8
+Lrtld_global_ro:
+	.long	C_SYMBOL_NAME(_rtld_global_ro)(GOT)
 #else
-#define PLD(code...)
+Lhwcap:
+	.long	C_SYMBOL_NAME(_dl_hwcap)
 #endif
-
-/*
- * This can be used to enable code to cacheline align the source pointer.
- * Experiments on tested architectures (StrongARM and XScale) didn't show
- * this a worthwhile thing to do.  That might be different in the future.
- */
-//#define CALGN(code...)        code
-#define CALGN(code...)
-
-/*
- * Endian independent macros for shifting bytes within registers.
- */
-#ifndef __ARMEB__
-#define PULL            lsr
-#define PUSH            lsl
-#else
-#define PULL            lsl
-#define PUSH            lsr
 #endif
 
-		.text
-		.syntax unified
-
-/* Prototype: void *memcpy(void *dest, const void *src, size_t n); */
-
-ENTRY(memcpy)
-
+Lno_neon:
+		@ Generic ARM memcpy.
 		push	{r0, r4, lr}
 		cfi_adjust_cfa_offset (12)
 		cfi_rel_offset (r4, 4)
@@ -473,4 +512,3 @@ ENTRY(memcpy)
 END(memcpy)
 libc_hidden_builtin_def (memcpy)
 
-#endif
-- 
1.7.10.4

next             reply	other threads:[~2013-04-03  7:58 UTC|newest]

Thread overview: 19+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2013-04-03  7:58 Shih-Yuan Lee (FourDollars) [this message]
2013-04-03  8:15 ` Will Newton
2013-04-03  9:19   ` Ondřej Bílka
2013-04-03 15:08 ` Joseph S. Myers
2013-04-03 15:48   ` Shih-Yuan Lee (FourDollars)
2013-04-03 16:02     ` Joseph S. Myers
2013-04-04  3:56       ` Shih-Yuan Lee (FourDollars)
2013-04-03 16:20     ` [Patches] " Ondřej Bílka
2013-04-04  4:15       ` Shih-Yuan Lee (FourDollars)
2013-04-04  6:37         ` Ondřej Bílka
2013-04-08  9:12           ` Will Newton
2013-04-08 10:27             ` Ondřej Bílka
2013-04-09  8:45         ` Richard Earnshaw
2013-04-09  9:05   ` Richard Earnshaw
2013-04-09 12:04     ` Ondřej Bílka
2013-04-09 12:59     ` Carlos O'Donell
2013-04-09 15:00       ` Richard Earnshaw
2013-04-09 15:54         ` Ondřej Bílka
2013-04-09 15:59         ` Carlos O'Donell

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=CAAT15mNnqeb6tuVdV6b4uJf-qFDH1acxevyW6f-gH+SkguENmg@mail.gmail.com \
    --to=sylee@canonical.com \
    --cc=fourdollars@gmail.com \
    --cc=jesse.sung@canonical.com \
    --cc=libc-ports@sourceware.org \
    --cc=patches@eglibc.org \
    --cc=rex.tsai@canonical.com \
    --cc=yc.cheng@canonical.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).