From: "Shih-Yuan Lee (FourDollars)" <sylee@canonical.com>
To: patches@eglibc.org, libc-ports@sourceware.org
Cc: rex.tsai@canonical.com, jesse.sung@canonical.com,
yc.cheng@canonical.com, Shih-Yuan Lee <fourdollars@gmail.com>
Subject: [PATCH] ARM: NEON detected memcpy.
Date: Wed, 03 Apr 2013 07:58:00 -0000 [thread overview]
Message-ID: <CAAT15mNnqeb6tuVdV6b4uJf-qFDH1acxevyW6f-gH+SkguENmg@mail.gmail.com> (raw)
[-- Attachment #1: Type: text/plain, Size: 560 bytes --]
Hi,
I am working on a memcpy that detects NEON at run time.
This is based on what Siarhei Siamashka did in 2009 [1].
The idea is to read HWCAP and check the NEON bit.
If the NEON bit is set, the NEON-optimized memcpy is used.
If not, the original memcpy is used instead.
With the NEON-optimized memcpy, the performance of memcpy
improves by about 50% [2].
What do you think about this idea? Any comments are welcome.
[1]: http://sourceware.org/ml/libc-ports/2009-07/msg00003.html
[2]: http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.faqs/ka13544.html
Regards,
$4
[-- Attachment #2: 0001-ARM-NEON-optimized-implementation-of-memcpy.patch --]
[-- Type: application/octet-stream, Size: 4198 bytes --]
From 8d746bb4e05cab5a5430e59653ddac2d6cb62e32 Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
Date: Sun, 5 Jul 2009 18:21:03 +0300
Subject: [PATCH 1/2] ARM: NEON optimized implementation of memcpy.
---
ports/sysdeps/arm/memcpy.S | 132 ++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 132 insertions(+)
diff --git a/ports/sysdeps/arm/memcpy.S b/ports/sysdeps/arm/memcpy.S
index add82e2..c1b1357 100644
--- a/ports/sysdeps/arm/memcpy.S
+++ b/ports/sysdeps/arm/memcpy.S
@@ -2,6 +2,7 @@
This file is part of the GNU C Library.
Contributed by MontaVista Software, Inc. (written by Nicolas Pitre)
+ NEON code contributed by Nokia Corporation (written by Siarhei Siamashka)
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
@@ -22,6 +23,135 @@
#include <sysdep.h>
#include <arm-features.h>
+#ifdef __ARM_NEON__
+ .text
+ .fpu neon
+
+/*
+ * ENABLE_UNALIGNED_MEM_ACCESSES macro can be defined to permit the use
+ * of unaligned load/store memory accesses supported since ARMv6. This
+ * will further improve performance, but can purely theoretically cause
+ * problems if somebody decides to set SCTLR.A bit in the OS kernel
+ * (to trap each unaligned memory access) or somehow mess with strongly
+ * ordered/device memory.
+ */
+
+#define NEON_MAX_PREFETCH_DISTANCE 320
+
+ENTRY(memcpy) /* NEON memcpy: r0 = dest, r1 = src, r2 = byte count; r0 is never written below, so it is still dest at the final bx lr */
+ mov ip, r0 /* ip is the moving destination pointer; r0 stays untouched */
+ cmp r2, #16 /* NOTE(review): the blt below is a signed test, so a count with bit 31 set also takes it -- confirm such sizes cannot reach here */
+ blt 4f @ Have less than 16 bytes to copy
+
+ @ First ensure 16 byte alignment for the destination buffer
+ tst r0, #0xF
+ beq 2f /* low four address bits clear: skip the alignment fix-up */
+ tst r0, #1 /* destination address odd: copy one byte */
+ ldrneb r3, [r1], #1
+ strneb r3, [ip], #1
+ subne r2, r2, #1
+ tst ip, #2 /* bit 1 of the destination set: copy two bytes */
+#ifdef ENABLE_UNALIGNED_MEM_ACCESSES
+ ldrneh r3, [r1], #2
+ strneh r3, [ip], #2
+#else
+ ldrneb r3, [r1], #1
+ strneb r3, [ip], #1
+ ldrneb r3, [r1], #1
+ strneb r3, [ip], #1
+#endif
+ subne r2, r2, #2 /* still conditional on tst ip, #2: the loads/stores above leave the flags alone */
+
+ tst ip, #4 /* bit 2 of the destination set: copy four bytes */
+ beq 1f
+ vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]! /* four bytes, one into lane 0 of each of d0-d3 */
+ vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [ip, :32]! /* :32 = destination is 4-byte aligned at this point */
+ sub r2, r2, #4
+1:
+ tst ip, #8 /* bit 3 of the destination set: copy eight bytes */
+ beq 2f
+ vld1.8 {d0}, [r1]!
+ vst1.8 {d0}, [ip, :64]!
+ sub r2, r2, #8
+2:
+ subs r2, r2, #32 /* from here on r2 holds (bytes left - 32) */
+ blt 3f /* fewer than 32 bytes left: skip both 32-byte loops */
+ mov r3, #32 /* r3 = current prefetch distance in bytes */
+
+ @ Main copy loop, 32 bytes are processed per iteration.
+ @ ARM instructions are used for doing fine-grained prefetch,
+ @ increasing prefetch distance progressively up to
+ @ NEON_MAX_PREFETCH_DISTANCE at runtime
+1:
+ vld1.8 {d0-d3}, [r1]!
+ cmp r3, #(NEON_MAX_PREFETCH_DISTANCE - 32)
+ pld [r1, r3] /* prefetch r3 bytes past the updated source pointer */
+ addle r3, r3, #32 /* uses the cmp flags: grow the distance until it reaches NEON_MAX_PREFETCH_DISTANCE */
+ vst1.8 {d0-d3}, [ip, :128]!
+ sub r2, r2, #32
+ cmp r2, r3
+ bge 1b /* stay in this loop while (bytes left - 32) >= prefetch distance */
+ cmp r2, #0
+ blt 3f /* negative: fewer than 32 bytes left */
+1: @ Copy the remaining part of the buffer (already prefetched)
+ vld1.8 {d0-d3}, [r1]!
+ subs r2, r2, #32
+ vst1.8 {d0-d3}, [ip, :128]!
+ bge 1b /* flags come from the subs two lines up */
+3: @ Copy up to 31 remaining bytes
+ tst r2, #16 /* r2 is negative here, but only multiples of 32 were subtracted in the loops, so bits 4..0 still equal bytes left */
+ beq 4f
+ vld1.8 {d0, d1}, [r1]!
+ vst1.8 {d0, d1}, [ip, :128]!
+4:
+ @ Use ARM instructions exclusively for the final trailing part
+ @ not fully fitting into full 16 byte aligned block in order
+ @ to avoid "ARM store after NEON store" hazard. Also NEON
+ @ pipeline will be (mostly) flushed by the time when the
+ @ control returns to the caller, making the use of NEON mostly
+ @ transparent (and avoiding hazards in the caller code)
+
+#ifdef ENABLE_UNALIGNED_MEM_ACCESSES
+ movs r3, r2, lsl #29 /* C = bit 3 of r2 (8 bytes), N = bit 2 of r2 (4 bytes) */
+ ldrcs r3, [r1], #4
+ strcs r3, [ip], #4
+ ldrcs r3, [r1], #4
+ strcs r3, [ip], #4
+ ldrmi r3, [r1], #4
+ strmi r3, [ip], #4
+ movs r2, r2, lsl #31 /* C = bit 1 of r2 (2 bytes), N = bit 0 of r2 (1 byte) */
+ ldrcsh r3, [r1], #2
+ strcsh r3, [ip], #2
+ ldrmib r3, [r1], #1
+ strmib r3, [ip], #1
+#else
+ movs r3, r2, lsl #29 /* C = bit 3 of r2 (8 bytes), N = bit 2 of r2 (4 bytes) */
+ bcc 1f /* bit 3 clear: skip the eight byte copies */
+ .rept 8
+ ldrcsb r3, [r1], #1
+ strcsb r3, [ip], #1
+ .endr
+1:
+ bpl 1f /* bit 2 clear: skip the four byte copies; N is still from the movs above */
+ .rept 4
+ ldrmib r3, [r1], #1
+ strmib r3, [ip], #1
+ .endr
+1:
+ movs r2, r2, lsl #31 /* C = bit 1 of r2 (2 bytes), N = bit 0 of r2 (1 byte) */
+ ldrcsb r3, [r1], #1
+ strcsb r3, [ip], #1
+ ldrcsb r3, [r1], #1
+ strcsb r3, [ip], #1
+ ldrmib r3, [r1], #1
+ strmib r3, [ip], #1
+#endif
+ bx lr /* return to the caller with r0 unchanged */
+END(memcpy)
+libc_hidden_builtin_def (memcpy)
+
+#else
+
/*
* Data preload for architectures that support it (ARM V5TE and above)
*/
@@ -342,3 +472,5 @@ ENTRY(memcpy)
END(memcpy)
libc_hidden_builtin_def (memcpy)
+
+#endif
--
1.7.10.4
[-- Attachment #3: 0002-ARM-NEON-detected-memcpy.patch --]
[-- Type: application/octet-stream, Size: 4577 bytes --]
From 64299d7bd853314bc3bc96853220461533f26069 Mon Sep 17 00:00:00 2001
From: "Shih-Yuan Lee (FourDollars)" <sylee@canonical.com>
Date: Wed, 3 Apr 2013 14:07:37 +0800
Subject: [PATCH 2/2] ARM: NEON detected memcpy.
---
ports/sysdeps/arm/memcpy.S | 120 +++++++++++++++++++++++++++++---------------
1 file changed, 79 insertions(+), 41 deletions(-)
diff --git a/ports/sysdeps/arm/memcpy.S b/ports/sysdeps/arm/memcpy.S
index c1b1357..94fe8e2 100644
--- a/ports/sysdeps/arm/memcpy.S
+++ b/ports/sysdeps/arm/memcpy.S
@@ -3,6 +3,8 @@
Contributed by MontaVista Software, Inc. (written by Nicolas Pitre)
NEON code contributed by Nokia Corporation (written by Siarhei Siamashka)
+ NEON detection contributed by Canonical Ltd. (written by Shih-Yuan Lee
+ aka FourDollars)
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
@@ -23,9 +25,36 @@
#include <sysdep.h>
#include <arm-features.h>
-#ifdef __ARM_NEON__
- .text
- .fpu neon
+/*
+ * Data preload for architectures that support it (ARM V5TE and above)
+ */
+#if (!defined (__ARM_ARCH_2__) && !defined (__ARM_ARCH_3__) \
+ && !defined (__ARM_ARCH_3M__) && !defined (__ARM_ARCH_4__) \
+ && !defined (__ARM_ARCH_4T__) && !defined (__ARM_ARCH_5__) \
+ && !defined (__ARM_ARCH_5T__))
+#define PLD(code...) code
+#else
+#define PLD(code...)
+#endif
+
+/*
+ * This can be used to enable code to cacheline align the source pointer.
+ * Experiments on tested architectures (StrongARM and XScale) didn't show
+ * this a worthwhile thing to do. That might be different in the future.
+ */
+//#define CALGN(code...) code
+#define CALGN(code...)
+
+/*
+ * Endian independent macros for shifting bytes within registers.
+ */
+#ifndef __ARMEB__
+#define PULL lsr
+#define PUSH lsl
+#else
+#define PULL lsl
+#define PUSH lsr
+#endif
/*
* ENABLE_UNALIGNED_MEM_ACCESSES macro can be defined to permit the use
@@ -36,9 +65,44 @@
* ordered/device memory.
*/
+#ifdef __ARM_FEATURE_UNALIGNED
+#define ENABLE_UNALIGNED_MEM_ACCESSES 1
+#endif
+
#define NEON_MAX_PREFETCH_DISTANCE 320
+ .text
+ .fpu neon
+
+/* Prototype: void *memcpy(void *dest, const void *src, size_t n); */
+
ENTRY(memcpy)
+ stmfd sp!, {r0, r1}
+
+ @ Check if there is a NEON extension.
+#ifdef IS_IN_rtld
+ ldr a1, 5f
+ ldr a2, Lrtld_local_ro
+0: add a1, pc, a1
+ add a1, a1, a2
+ ldr a1, [a1, #RTLD_GLOBAL_RO_DL_HWCAP_OFFSET]
+#else
+#ifdef PIC
+ ldr a1, 5f
+ ldr a2, Lrtld_global_ro
+0: add a1, pc, a1
+ ldr a1, [a1, a2]
+ ldr a1, [a1, #RTLD_GLOBAL_RO_DL_HWCAP_OFFSET]
+#else
+ ldr a1, Lhwcap
+ ldr a1, [a1, #0]
+#endif
+#endif
+ tst a1, #HWCAP_ARM_NEON
+ ldmfd sp!, {r0, r1}
+ beq Lno_neon
+
+ @ Optimized memcpy by NEON extension.
mov ip, r0
cmp r2, #16
blt 4f @ Have less than 16 bytes to copy
@@ -147,49 +211,24 @@ ENTRY(memcpy)
strmib r3, [ip], #1
#endif
bx lr
-END(memcpy)
-libc_hidden_builtin_def (memcpy)
+#ifdef IS_IN_rtld
+5: .long _GLOBAL_OFFSET_TABLE_ - 0b - 8
+Lrtld_local_ro:
+ .long C_SYMBOL_NAME(_rtld_local_ro)(GOTOFF)
#else
-
-/*
- * Data preload for architectures that support it (ARM V5TE and above)
- */
-#if (!defined (__ARM_ARCH_2__) && !defined (__ARM_ARCH_3__) \
- && !defined (__ARM_ARCH_3M__) && !defined (__ARM_ARCH_4__) \
- && !defined (__ARM_ARCH_4T__) && !defined (__ARM_ARCH_5__) \
- && !defined (__ARM_ARCH_5T__))
-#define PLD(code...) code
+#ifdef PIC
+5: .long _GLOBAL_OFFSET_TABLE_ - 0b - 8
+Lrtld_global_ro:
+ .long C_SYMBOL_NAME(_rtld_global_ro)(GOT)
#else
-#define PLD(code...)
+Lhwcap:
+ .long C_SYMBOL_NAME(_dl_hwcap)
#endif
-
-/*
- * This can be used to enable code to cacheline align the source pointer.
- * Experiments on tested architectures (StrongARM and XScale) didn't show
- * this a worthwhile thing to do. That might be different in the future.
- */
-//#define CALGN(code...) code
-#define CALGN(code...)
-
-/*
- * Endian independent macros for shifting bytes within registers.
- */
-#ifndef __ARMEB__
-#define PULL lsr
-#define PUSH lsl
-#else
-#define PULL lsl
-#define PUSH lsr
#endif
- .text
- .syntax unified
-
-/* Prototype: void *memcpy(void *dest, const void *src, size_t n); */
-
-ENTRY(memcpy)
-
+Lno_neon:
+ @ Generic ARM memcpy.
push {r0, r4, lr}
cfi_adjust_cfa_offset (12)
cfi_rel_offset (r4, 4)
@@ -473,4 +512,3 @@ ENTRY(memcpy)
END(memcpy)
libc_hidden_builtin_def (memcpy)
-#endif
--
1.7.10.4
next reply other threads:[~2013-04-03 7:58 UTC|newest]
Thread overview: 19+ messages / expand[flat|nested] mbox.gz Atom feed top
2013-04-03 7:58 Shih-Yuan Lee (FourDollars) [this message]
2013-04-03 8:15 ` Will Newton
2013-04-03 9:19 ` Ondřej Bílka
2013-04-03 15:08 ` Joseph S. Myers
2013-04-03 15:48 ` Shih-Yuan Lee (FourDollars)
2013-04-03 16:02 ` Joseph S. Myers
2013-04-04 3:56 ` Shih-Yuan Lee (FourDollars)
2013-04-03 16:20 ` [Patches] " Ondřej Bílka
2013-04-04 4:15 ` Shih-Yuan Lee (FourDollars)
2013-04-04 6:37 ` Ondřej Bílka
2013-04-08 9:12 ` Will Newton
2013-04-08 10:27 ` Ondřej Bílka
2013-04-09 8:45 ` Richard Earnshaw
2013-04-09 9:05 ` Richard Earnshaw
2013-04-09 12:04 ` Ondřej Bílka
2013-04-09 12:59 ` Carlos O'Donell
2013-04-09 15:00 ` Richard Earnshaw
2013-04-09 15:54 ` Ondřej Bílka
2013-04-09 15:59 ` Carlos O'Donell
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=CAAT15mNnqeb6tuVdV6b4uJf-qFDH1acxevyW6f-gH+SkguENmg@mail.gmail.com \
--to=sylee@canonical.com \
--cc=fourdollars@gmail.com \
--cc=jesse.sung@canonical.com \
--cc=libc-ports@sourceware.org \
--cc=patches@eglibc.org \
--cc=rex.tsai@canonical.com \
--cc=yc.cheng@canonical.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).