* [PATCH] ARM: NEON detected memcpy.
@ 2013-04-03 7:58 Shih-Yuan Lee (FourDollars)
2013-04-03 8:15 ` Will Newton
2013-04-03 15:08 ` Joseph S. Myers
0 siblings, 2 replies; 19+ messages in thread
From: Shih-Yuan Lee (FourDollars) @ 2013-04-03 7:58 UTC (permalink / raw)
To: patches, libc-ports; +Cc: rex.tsai, jesse.sung, yc.cheng, Shih-Yuan Lee
[-- Attachment #1: Type: text/plain, Size: 560 bytes --]
Hi,
I am working on the NEON detected memcpy.
This is based on what Siarhei Siamashka did at 2009 [1].
The idea is to use HWCAP and check NEON bit.
If there is a NEON bit, using NEON optimized memcpy.
If not, using the original memcpy instead.
If using NEON optimized memcpy, the performance of memcpy will be
raised up by about 50% [2].
How do you think about this idea? Any comment is welcome.
[1]: http://sourceware.org/ml/libc-ports/2009-07/msg00003.html
[2]: http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.faqs/ka13544.html
Regards,
$4
[-- Attachment #2: 0001-ARM-NEON-optimized-implementation-of-memcpy.patch --]
[-- Type: application/octet-stream, Size: 4198 bytes --]
From 8d746bb4e05cab5a5430e59653ddac2d6cb62e32 Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
Date: Sun, 5 Jul 2009 18:21:03 +0300
Subject: [PATCH 1/2] ARM: NEON optimized implementation of memcpy.
---
ports/sysdeps/arm/memcpy.S | 132 ++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 132 insertions(+)
diff --git a/ports/sysdeps/arm/memcpy.S b/ports/sysdeps/arm/memcpy.S
index add82e2..c1b1357 100644
--- a/ports/sysdeps/arm/memcpy.S
+++ b/ports/sysdeps/arm/memcpy.S
@@ -2,6 +2,7 @@
This file is part of the GNU C Library.
Contributed by MontaVista Software, Inc. (written by Nicolas Pitre)
+ NEON code contributed by Nokia Corporation (written by Siarhei Siamashka)
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
@@ -22,6 +23,135 @@
#include <sysdep.h>
#include <arm-features.h>
+#ifdef __ARM_NEON__
+ .text
+ .fpu neon
+
+/*
+ * ENABLE_UNALIGNED_MEM_ACCESSES macro can be defined to permit the use
+ * of unaligned load/store memory accesses supported since ARMv6. This
+ * will further improve performance, but can purely theoretically cause
+ * problems if somebody decides to set SCTLR.A bit in the OS kernel
+ * (to trap each unaligned memory access) or somehow mess with strongly
+ * ordered/device memory.
+ */
+
+#define NEON_MAX_PREFETCH_DISTANCE 320
+
+ENTRY(memcpy)
+ mov ip, r0
+ cmp r2, #16
+ blt 4f @ Have less than 16 bytes to copy
+
+ @ First ensure 16 byte alignment for the destination buffer
+ tst r0, #0xF
+ beq 2f
+ tst r0, #1
+ ldrneb r3, [r1], #1
+ strneb r3, [ip], #1
+ subne r2, r2, #1
+ tst ip, #2
+#ifdef ENABLE_UNALIGNED_MEM_ACCESSES
+ ldrneh r3, [r1], #2
+ strneh r3, [ip], #2
+#else
+ ldrneb r3, [r1], #1
+ strneb r3, [ip], #1
+ ldrneb r3, [r1], #1
+ strneb r3, [ip], #1
+#endif
+ subne r2, r2, #2
+
+ tst ip, #4
+ beq 1f
+ vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]!
+ vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [ip, :32]!
+ sub r2, r2, #4
+1:
+ tst ip, #8
+ beq 2f
+ vld1.8 {d0}, [r1]!
+ vst1.8 {d0}, [ip, :64]!
+ sub r2, r2, #8
+2:
+ subs r2, r2, #32
+ blt 3f
+ mov r3, #32
+
+ @ Main copy loop, 32 bytes are processed per iteration.
+ @ ARM instructions are used for doing fine-grained prefetch,
+ @ increasing prefetch distance progressively up to
+ @ NEON_MAX_PREFETCH_DISTANCE at runtime
+1:
+ vld1.8 {d0-d3}, [r1]!
+ cmp r3, #(NEON_MAX_PREFETCH_DISTANCE - 32)
+ pld [r1, r3]
+ addle r3, r3, #32
+ vst1.8 {d0-d3}, [ip, :128]!
+ sub r2, r2, #32
+ cmp r2, r3
+ bge 1b
+ cmp r2, #0
+ blt 3f
+1: @ Copy the remaining part of the buffer (already prefetched)
+ vld1.8 {d0-d3}, [r1]!
+ subs r2, r2, #32
+ vst1.8 {d0-d3}, [ip, :128]!
+ bge 1b
+3: @ Copy up to 31 remaining bytes
+ tst r2, #16
+ beq 4f
+ vld1.8 {d0, d1}, [r1]!
+ vst1.8 {d0, d1}, [ip, :128]!
+4:
+ @ Use ARM instructions exclusively for the final trailing part
+ @ not fully fitting into full 16 byte aligned block in order
+ @ to avoid "ARM store after NEON store" hazard. Also NEON
+ @ pipeline will be (mostly) flushed by the time when the
+ @ control returns to the caller, making the use of NEON mostly
+ @ transparent (and avoiding hazards in the caller code)
+
+#ifdef ENABLE_UNALIGNED_MEM_ACCESSES
+ movs r3, r2, lsl #29
+ ldrcs r3, [r1], #4
+ strcs r3, [ip], #4
+ ldrcs r3, [r1], #4
+ strcs r3, [ip], #4
+ ldrmi r3, [r1], #4
+ strmi r3, [ip], #4
+ movs r2, r2, lsl #31
+ ldrcsh r3, [r1], #2
+ strcsh r3, [ip], #2
+ ldrmib r3, [r1], #1
+ strmib r3, [ip], #1
+#else
+ movs r3, r2, lsl #29
+ bcc 1f
+ .rept 8
+ ldrcsb r3, [r1], #1
+ strcsb r3, [ip], #1
+ .endr
+1:
+ bpl 1f
+ .rept 4
+ ldrmib r3, [r1], #1
+ strmib r3, [ip], #1
+ .endr
+1:
+ movs r2, r2, lsl #31
+ ldrcsb r3, [r1], #1
+ strcsb r3, [ip], #1
+ ldrcsb r3, [r1], #1
+ strcsb r3, [ip], #1
+ ldrmib r3, [r1], #1
+ strmib r3, [ip], #1
+#endif
+ bx lr
+END(memcpy)
+libc_hidden_builtin_def (memcpy)
+
+#else
+
/*
* Data preload for architectures that support it (ARM V5TE and above)
*/
@@ -342,3 +472,5 @@ ENTRY(memcpy)
END(memcpy)
libc_hidden_builtin_def (memcpy)
+
+#endif
--
1.7.10.4
[-- Attachment #3: 0002-ARM-NEON-detected-memcpy.patch --]
[-- Type: application/octet-stream, Size: 4577 bytes --]
From 64299d7bd853314bc3bc96853220461533f26069 Mon Sep 17 00:00:00 2001
From: "Shih-Yuan Lee (FourDollars)" <sylee@canonical.com>
Date: Wed, 3 Apr 2013 14:07:37 +0800
Subject: [PATCH 2/2] ARM: NEON detected memcpy.
---
ports/sysdeps/arm/memcpy.S | 120 +++++++++++++++++++++++++++++---------------
1 file changed, 79 insertions(+), 41 deletions(-)
diff --git a/ports/sysdeps/arm/memcpy.S b/ports/sysdeps/arm/memcpy.S
index c1b1357..94fe8e2 100644
--- a/ports/sysdeps/arm/memcpy.S
+++ b/ports/sysdeps/arm/memcpy.S
@@ -3,6 +3,8 @@
Contributed by MontaVista Software, Inc. (written by Nicolas Pitre)
NEON code contributed by Nokia Corporation (written by Siarhei Siamashka)
+ NEON detection contributed by Canonical Ltd. (written by Shih-Yuan Lee
+ aka FourDollars)
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
@@ -23,9 +25,36 @@
#include <sysdep.h>
#include <arm-features.h>
-#ifdef __ARM_NEON__
- .text
- .fpu neon
+/*
+ * Data preload for architectures that support it (ARM V5TE and above)
+ */
+#if (!defined (__ARM_ARCH_2__) && !defined (__ARM_ARCH_3__) \
+ && !defined (__ARM_ARCH_3M__) && !defined (__ARM_ARCH_4__) \
+ && !defined (__ARM_ARCH_4T__) && !defined (__ARM_ARCH_5__) \
+ && !defined (__ARM_ARCH_5T__))
+#define PLD(code...) code
+#else
+#define PLD(code...)
+#endif
+
+/*
+ * This can be used to enable code to cacheline align the source pointer.
+ * Experiments on tested architectures (StrongARM and XScale) didn't show
+ * this a worthwhile thing to do. That might be different in the future.
+ */
+//#define CALGN(code...) code
+#define CALGN(code...)
+
+/*
+ * Endian independent macros for shifting bytes within registers.
+ */
+#ifndef __ARMEB__
+#define PULL lsr
+#define PUSH lsl
+#else
+#define PULL lsl
+#define PUSH lsr
+#endif
/*
* ENABLE_UNALIGNED_MEM_ACCESSES macro can be defined to permit the use
@@ -36,9 +65,44 @@
* ordered/device memory.
*/
+#ifdef __ARM_FEATURE_UNALIGNED
+#define ENABLE_UNALIGNED_MEM_ACCESSES 1
+#endif
+
#define NEON_MAX_PREFETCH_DISTANCE 320
+ .text
+ .fpu neon
+
+/* Prototype: void *memcpy(void *dest, const void *src, size_t n); */
+
ENTRY(memcpy)
+ stmfd sp!, {r0, r1}
+
+ @ Check if there is a NEON extension.
+#ifdef IS_IN_rtld
+ ldr a1, 5f
+ ldr a2, Lrtld_local_ro
+0: add a1, pc, a1
+ add a1, a1, a2
+ ldr a1, [a1, #RTLD_GLOBAL_RO_DL_HWCAP_OFFSET]
+#else
+#ifdef PIC
+ ldr a1, 5f
+ ldr a2, Lrtld_global_ro
+0: add a1, pc, a1
+ ldr a1, [a1, a2]
+ ldr a1, [a1, #RTLD_GLOBAL_RO_DL_HWCAP_OFFSET]
+#else
+ ldr a1, Lhwcap
+ ldr a1, [a1, #0]
+#endif
+#endif
+ tst a1, #HWCAP_ARM_NEON
+ ldmfd sp!, {r0, r1}
+ beq Lno_neon
+
+ @ Optimized memcpy by NEON extension.
mov ip, r0
cmp r2, #16
blt 4f @ Have less than 16 bytes to copy
@@ -147,49 +211,24 @@ ENTRY(memcpy)
strmib r3, [ip], #1
#endif
bx lr
-END(memcpy)
-libc_hidden_builtin_def (memcpy)
+#ifdef IS_IN_rtld
+5: .long _GLOBAL_OFFSET_TABLE_ - 0b - 8
+Lrtld_local_ro:
+ .long C_SYMBOL_NAME(_rtld_local_ro)(GOTOFF)
#else
-
-/*
- * Data preload for architectures that support it (ARM V5TE and above)
- */
-#if (!defined (__ARM_ARCH_2__) && !defined (__ARM_ARCH_3__) \
- && !defined (__ARM_ARCH_3M__) && !defined (__ARM_ARCH_4__) \
- && !defined (__ARM_ARCH_4T__) && !defined (__ARM_ARCH_5__) \
- && !defined (__ARM_ARCH_5T__))
-#define PLD(code...) code
+#ifdef PIC
+5: .long _GLOBAL_OFFSET_TABLE_ - 0b - 8
+Lrtld_global_ro:
+ .long C_SYMBOL_NAME(_rtld_global_ro)(GOT)
#else
-#define PLD(code...)
+Lhwcap:
+ .long C_SYMBOL_NAME(_dl_hwcap)
#endif
-
-/*
- * This can be used to enable code to cacheline align the source pointer.
- * Experiments on tested architectures (StrongARM and XScale) didn't show
- * this a worthwhile thing to do. That might be different in the future.
- */
-//#define CALGN(code...) code
-#define CALGN(code...)
-
-/*
- * Endian independent macros for shifting bytes within registers.
- */
-#ifndef __ARMEB__
-#define PULL lsr
-#define PUSH lsl
-#else
-#define PULL lsl
-#define PUSH lsr
#endif
- .text
- .syntax unified
-
-/* Prototype: void *memcpy(void *dest, const void *src, size_t n); */
-
-ENTRY(memcpy)
-
+Lno_neon:
+ @ Generic ARM memcpy.
push {r0, r4, lr}
cfi_adjust_cfa_offset (12)
cfi_rel_offset (r4, 4)
@@ -473,4 +512,3 @@ ENTRY(memcpy)
END(memcpy)
libc_hidden_builtin_def (memcpy)
-#endif
--
1.7.10.4
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH] ARM: NEON detected memcpy.
2013-04-03 7:58 [PATCH] ARM: NEON detected memcpy Shih-Yuan Lee (FourDollars)
@ 2013-04-03 8:15 ` Will Newton
2013-04-03 9:19 ` Ondřej Bílka
2013-04-03 15:08 ` Joseph S. Myers
1 sibling, 1 reply; 19+ messages in thread
From: Will Newton @ 2013-04-03 8:15 UTC (permalink / raw)
To: Shih-Yuan Lee (FourDollars)
Cc: patches, libc-ports, rex.tsai, jesse.sung, yc.cheng, Shih-Yuan Lee
On 3 April 2013 08:58, Shih-Yuan Lee (FourDollars) <sylee@canonical.com> wrote:
> Hi,
>
> I am working on the NEON detected memcpy.
> This is based on what Siarhei Siamashka did at 2009 [1].
>
> The idea is to use HWCAP and check NEON bit.
> If there is a NEON bit, using NEON optimized memcpy.
> If not, using the original memcpy instead.
>
> If using NEON optimized memcpy, the performance of memcpy will be
> raised up by about 50% [2].
>
> How do you think about this idea? Any comment is welcome.
Hi,
I am working on a similar project within Linaro, which is to add the
NEON/VFP capable memcpy from cortex-strings[1] to glibc. However I am
looking at enabling it at runtime via indirect functions which makes
it slightly more complex than just importing the cortex strings code,
so I don't have any patches to show you just yet.
[1] https://launchpad.net/cortex-strings
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH] ARM: NEON detected memcpy.
2013-04-03 8:15 ` Will Newton
@ 2013-04-03 9:19 ` Ondřej Bílka
0 siblings, 0 replies; 19+ messages in thread
From: Ondřej Bílka @ 2013-04-03 9:19 UTC (permalink / raw)
To: Will Newton
Cc: Shih-Yuan Lee (FourDollars),
patches, libc-ports, rex.tsai, jesse.sung, yc.cheng,
Shih-Yuan Lee
[-- Attachment #1: Type: text/plain, Size: 1426 bytes --]
On Wed, Apr 03, 2013 at 09:15:46AM +0100, Will Newton wrote:
> On 3 April 2013 08:58, Shih-Yuan Lee (FourDollars) <sylee@canonical.com> wrote:
> > Hi,
> >
> > I am working on the NEON detected memcpy.
> > This is based on what Siarhei Siamashka did at 2009 [1].
> >
> > The idea is to use HWCAP and check NEON bit.
> > If there is a NEON bit, using NEON optimized memcpy.
> > If not, using the original memcpy instead.
> >
> > If using NEON optimized memcpy, the performance of memcpy will be
> > raised up by about 50% [2].
> >
> > How do you think about this idea? Any comment is welcome.
>
> Hi,
>
> I am working on a similar project within Linaro, which is to add the
> NEON/VFP capable memcpy from cortex-strings[1] to glibc. However I am
> looking at enabling it at runtime via indirect functions which makes
> it slightly more complex than just importing the cortex strings code,
> so I don't have any patches to show you just yet.
>
> [1] https://launchpad.net/cortex-strings
Hi,
You need to optimize the header because you typically copy less than 128 bytes.
My measurement how many 16 byte blocks are used is here.
http://kam.mff.cuni.cz/~ondra/benchmark_string/profile/result.html
If I had code to get number of cycles from perf counter I could provide
tool to see memcpy performance in arbitrary binary.
On x64 I used overlapping load/store to minimize branches. Try how attached
memcpy works on small inputs.
[-- Attachment #2: memcpy_generic.c --]
[-- Type: text/plain, Size: 2048 bytes --]
#include <stdint.h>
#include <stdlib.h>
/* Align VALUE down by ALIGN bytes.  ALIGN must be a power of two.  */
#define ALIGN_DOWN(value, align) \
  ALIGN_DOWN_M1(value, (align) - 1)
/* Align VALUE down by ALIGN_M1 + 1 bytes.
   Useful if you have precomputed ALIGN - 1.  */
#define ALIGN_DOWN_M1(value, align_m1) \
  ((void *)((uintptr_t)(value) \
            & ~(uintptr_t)(align_m1)))
/* Align VALUE up by ALIGN bytes.  ALIGN must be a power of two.  */
#define ALIGN_UP(value, align) \
  ALIGN_UP_M1(value, (align) - 1)
/* Align VALUE up by ALIGN_M1 + 1 bytes.
   Useful if you have precomputed ALIGN - 1.  */
#define ALIGN_UP_M1(value, align_m1) \
  ((void *)(((uintptr_t)(value) + (uintptr_t)(align_m1)) \
            & ~(uintptr_t)(align_m1)))

/* Copy one 16-byte block from Y to X as two 64-bit words.
   Wrapped in do/while (0) so that it behaves as a single statement
   (safe after an unbraced `if' or `for').
   NOTE: the accesses go through uint64_t pointers whatever the real
   alignment of X and Y is; this relies on the target tolerating
   unaligned 64-bit accesses and is not strictly conforming C.  */
#define STOREU(x,y) STORE(x,y)
#define STORE(x,y) \
  do \
    { \
      ((uint64_t *)(x))[0] = ((const uint64_t *)(y))[0]; \
      ((uint64_t *)(x))[1] = ((const uint64_t *)(y))[1]; \
    } \
  while (0)
/* In this generic version a "load" is just the source address; LOAD marks
   call sites where the address is 16-byte aligned, LOADU where it may not
   be.  */
#define LOAD(x) (x)
#define LOADU(x) (x)

static char *memcpy_small (char *dest, char *src, size_t no, char *ret);

/* Copy N bytes from SRC to DEST (the buffers must not overlap) and return
   the original DEST.

   Sizes below 16 are handled by memcpy_small.  Otherwise the first and the
   last 16 bytes are copied with (possibly overlapping) unaligned block
   copies, and everything in between is copied in 16-byte blocks whose
   source address is 16-byte aligned.  Bytes covered twice are simply
   written twice with the same value, which avoids any branching on the
   head/tail length.  */
void *memcpy_new_u(char *dest, char *src, size_t n)
{
  size_t off;

  if (n < 16)
    return memcpy_small(dest, src, n, dest);

  /* Head and tail; together they cover everything when n <= 32.  */
  STOREU(dest, LOADU(src));
  STOREU(dest + n - 16, LOADU(src + n - 16));

  /* Offset of the first 16-byte aligned source address strictly after
     SRC.  It lies in 1..16, so the head block above leaves no gap.  */
  off = (size_t)((char *)ALIGN_DOWN(src + 16, 16) - src);

  /* Copy whole blocks while one still fits.  n - off cannot wrap: it is
     non-negative on entry (n >= 16 >= off) and off only grows by 16 when
     at least 16 bytes remain.  The fewer than 16 bytes left after the
     loop were already written by the tail block.  */
  for (; n - off >= 16; off += 16)
    STOREU(dest + off, LOAD(src + off));

  return dest;
}

/* Copy NO bytes (NO must be less than 16) from SRC to DEST and return RET.

   Each size class is handled by copying a fixed-size chunk from the start
   and the same size chunk from the end of the range; the two chunks may
   overlap.  The same unaligned-access caveat as for STORE applies.  */
static char *memcpy_small (char *dest, char *src, size_t no, char *ret)
{
  /* 8..15 bytes.  Bit 16 of the mask can never be set given NO < 16.  */
  if (no & (8 + 16))
    {
      ((uint64_t *) dest)[0] = ((uint64_t *) src)[0];
      ((uint64_t *)(dest + no - 8))[0] = ((uint64_t *)(src + no - 8))[0];
      return ret;
    }
  /* 4..7 bytes.  */
  if (no & 4)
    {
      ((uint32_t *) dest)[0] = ((uint32_t *) src)[0];
      ((uint32_t *)(dest + no - 4))[0] = ((uint32_t *)(src + no - 4))[0];
      return ret;
    }
  /* A zero-length copy must not touch either buffer.  */
  if (no == 0)
    return ret;
  /* 1..3 bytes: first byte, then the last two when there are 2 or 3.  */
  dest[0] = src[0];
  if (no & 2)
    {
      ((uint16_t *)(dest + no - 2))[0] = ((uint16_t *)(src + no - 2))[0];
      return ret;
    }
  return ret;
}
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH] ARM: NEON detected memcpy.
2013-04-03 7:58 [PATCH] ARM: NEON detected memcpy Shih-Yuan Lee (FourDollars)
2013-04-03 8:15 ` Will Newton
@ 2013-04-03 15:08 ` Joseph S. Myers
2013-04-03 15:48 ` Shih-Yuan Lee (FourDollars)
2013-04-09 9:05 ` Richard Earnshaw
1 sibling, 2 replies; 19+ messages in thread
From: Joseph S. Myers @ 2013-04-03 15:08 UTC (permalink / raw)
To: Shih-Yuan Lee (FourDollars)
Cc: patches, libc-ports, rex.tsai, jesse.sung, yc.cheng, Shih-Yuan Lee
On Wed, 3 Apr 2013, Shih-Yuan Lee (FourDollars) wrote:
> I am working on the NEON detected memcpy.
> This is based on what Siarhei Siamashka did at 2009 [1].
I still don't see any copyright assignment on file (whether individual
with an employer disclaimer, or corporate) that would cover that work.
Without one, we can't use it, and I advise anyone working on NEON memcpy
not to look at it.
I was previously told by people at ARM that NEON memcpy wasn't a good idea
in practice because of raised power consumption, context switch costs etc.
from using NEON in processes that otherwise didn't use it, even if it
appeared superficially beneficial in benchmarks.
--
Joseph S. Myers
joseph@codesourcery.com
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH] ARM: NEON detected memcpy.
2013-04-03 15:08 ` Joseph S. Myers
@ 2013-04-03 15:48 ` Shih-Yuan Lee (FourDollars)
2013-04-03 16:02 ` Joseph S. Myers
2013-04-03 16:20 ` [Patches] " Ondřej Bílka
2013-04-09 9:05 ` Richard Earnshaw
1 sibling, 2 replies; 19+ messages in thread
From: Shih-Yuan Lee (FourDollars) @ 2013-04-03 15:48 UTC (permalink / raw)
To: Joseph S. Myers; +Cc: patches, libc-ports, rex.tsai, Jesse Sung, YC Cheng
Hi Joseph,
I know there is some legal homework but I don't know how to do it.
Could you provide more details about how to put such copyright
assignment (with some real example is better)?
About raised power consumption and context switch costs, I may be able
to add some option in configure for the users to decide if they want
to use this feature or not.
How do you think?
Regards,
$4
On Wed, Apr 3, 2013 at 11:08 PM, Joseph S. Myers
<joseph@codesourcery.com> wrote:
> On Wed, 3 Apr 2013, Shih-Yuan Lee (FourDollars) wrote:
>
>> I am working on the NEON detected memcpy.
>> This is based on what Siarhei Siamashka did at 2009 [1].
>
> I still don't see any copyright assignment on file (whether individual
> with an employer disclaimer, or corporate) that would cover that work.
> Without one, we can't use it, and I advise anyone working on NEON memcpy
> not to look at it.
>
> I was previously told by people at ARM that NEON memcpy wasn't a good idea
> in practice because of raised power consumption, context switch costs etc.
> from using NEON in processes that otherwise didn't use it, even if it
> appeared superficially beneficial in benchmarks.
>
> --
> Joseph S. Myers
> joseph@codesourcery.com
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH] ARM: NEON detected memcpy.
2013-04-03 15:48 ` Shih-Yuan Lee (FourDollars)
@ 2013-04-03 16:02 ` Joseph S. Myers
2013-04-04 3:56 ` Shih-Yuan Lee (FourDollars)
2013-04-03 16:20 ` [Patches] " Ondřej Bílka
1 sibling, 1 reply; 19+ messages in thread
From: Joseph S. Myers @ 2013-04-03 16:02 UTC (permalink / raw)
To: Shih-Yuan Lee (FourDollars)
Cc: patches, libc-ports, rex.tsai, Jesse Sung, YC Cheng
On Wed, 3 Apr 2013, Shih-Yuan Lee (FourDollars) wrote:
> Hi Joseph,
>
> I know there is some legal homework but I don't know how to do it.
> Could you provide more details about how to put such copyright
> assignment (with some real example is better)?
See
<http://www.gnu.org/prep/maintain/html_node/Copyright-Papers.html>. To
use anything based on the 2009 work, either Nokia will need to complete a
corporate copyright assignment for that work, if it is copyright Nokia
pursuant to whatever employment contract Siarhei Siamashka had with Nokia
at the time, or Nokia will need to complete an employer disclaimer and
Siarhei Siamashka will need to complete an individual assignment. And the
same applies for anyone else who has made significant contributions to the
code.
> About raised power consumption and context switch costs, I may be able
> to add some option in configure for the users to decide if they want
> to use this feature or not.
> How do you think?
I suggest raising such a question of configure options for such trade-offs
more generally on libc-alpha.
--
Joseph S. Myers
joseph@codesourcery.com
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [Patches] [PATCH] ARM: NEON detected memcpy.
2013-04-03 15:48 ` Shih-Yuan Lee (FourDollars)
2013-04-03 16:02 ` Joseph S. Myers
@ 2013-04-03 16:20 ` Ondřej Bílka
2013-04-04 4:15 ` Shih-Yuan Lee (FourDollars)
1 sibling, 1 reply; 19+ messages in thread
From: Ondřej Bílka @ 2013-04-03 16:20 UTC (permalink / raw)
To: Shih-Yuan Lee (FourDollars)
Cc: Joseph S. Myers, libc-ports, Jesse Sung, patches, YC Cheng, rex.tsai
On Wed, Apr 03, 2013 at 11:47:36PM +0800, Shih-Yuan Lee (FourDollars) wrote:
> Hi Joseph,
>
...
> > I was previously told by people at ARM that NEON memcpy wasn't a good idea
> > in practice because of raised power consumption, context switch costs etc.
> > from using NEON in processes that otherwise didn't use it, even if it
> > appeared superficially beneficial in benchmarks.
> >
> About raised power consumption and context switch costs, I may be able
> to add some option in configure for the users to decide if they want
> to use this feature or not.
> How do you think?
>
Configure option is bit overkill.
You need to compare neon/other implementation speed. Then determine
size where neon is faster if we include energy cost and context switch.
My first estimate is use neon when larger than 4096 bytes.
However to determine context switch cost of neon you must account network effect.
If you use neon in one function that is called sufficiently often (to
always save registers) then adding neon implementation for additional functions
does not increase cost.
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH] ARM: NEON detected memcpy.
2013-04-03 16:02 ` Joseph S. Myers
@ 2013-04-04 3:56 ` Shih-Yuan Lee (FourDollars)
0 siblings, 0 replies; 19+ messages in thread
From: Shih-Yuan Lee (FourDollars) @ 2013-04-04 3:56 UTC (permalink / raw)
To: Joseph S. Myers; +Cc: patches, libc-ports, rex.tsai, Jesse Sung, YC Cheng
Hi Joseph,
Thank you for your guidance.
I will try to contact Siarhei Siamashka and Nokia to see how we can
make things happening.
Regards,
$4
On Thu, Apr 4, 2013 at 12:02 AM, Joseph S. Myers
<joseph@codesourcery.com> wrote:
> On Wed, 3 Apr 2013, Shih-Yuan Lee (FourDollars) wrote:
>
>> Hi Joseph,
>>
>> I know there is some legal homework but I don't know how to do it.
>> Could you provide more details about how to put such copyright
>> assignment (with some real example is better)?
>
> See
> <http://www.gnu.org/prep/maintain/html_node/Copyright-Papers.html>. To
> use anything based on the 2009 work, either Nokia will need to complete a
> corporate copyright assignment for that work, if it is copyright Nokia
> pursuant to whatever employment contract Siarhei Siamashka had with Nokia
> at the time, or Nokia will need to complete an employer disclaimer and
> Siarhei Siamashka will need to complete an individual assignment. And the
> same applies for anyone else who has made significant contributions to the
> code.
>
>> About raised power consumption and context switch costs, I may be able
>> to add some option in configure for the users to decide if they want
>> to use this feature or not.
>> How do you think?
>
> I suggest raising such a question of configure options for such trade-offs
> more generally on libc-alpha.
>
> --
> Joseph S. Myers
> joseph@codesourcery.com
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [Patches] [PATCH] ARM: NEON detected memcpy.
2013-04-03 16:20 ` [Patches] " Ondřej Bílka
@ 2013-04-04 4:15 ` Shih-Yuan Lee (FourDollars)
2013-04-04 6:37 ` Ondřej Bílka
2013-04-09 8:45 ` Richard Earnshaw
0 siblings, 2 replies; 19+ messages in thread
From: Shih-Yuan Lee (FourDollars) @ 2013-04-04 4:15 UTC (permalink / raw)
To: Ondřej Bílka
Cc: Joseph S. Myers, libc-ports, Jesse Sung, patches, YC Cheng, rex.tsai
Hi Ondrej,
I do have some benchmark data.
--- Running benchmarks (average case/perfect alignment case) ---
very small data test:
memcpy_arm : (3 bytes copy) = 86.2 MB/s / 88.3 MB/s
memcpy_neon : (3 bytes copy) = 53.4 MB/s / 54.5 MB/s
memcpy_arm : (4 bytes copy) = 79.8 MB/s / 62.9 MB/s
memcpy_neon : (4 bytes copy) = 72.5 MB/s / 73.9 MB/s
memcpy_arm : (5 bytes copy) = 91.0 MB/s / 78.7 MB/s
memcpy_neon : (5 bytes copy) = 90.2 MB/s / 91.0 MB/s
memcpy_arm : (7 bytes copy) = 109.5 MB/s / 104.7 MB/s
memcpy_neon : (7 bytes copy) = 122.1 MB/s / 126.6 MB/s
memcpy_arm : (8 bytes copy) = 122.4 MB/s / 122.4 MB/s
memcpy_neon : (8 bytes copy) = 142.0 MB/s / 148.2 MB/s
memcpy_arm : (11 bytes copy) = 157.8 MB/s / 161.3 MB/s
memcpy_neon : (11 bytes copy) = 193.8 MB/s / 196.2 MB/s
memcpy_arm : (12 bytes copy) = 170.1 MB/s / 172.7 MB/s
memcpy_neon : (12 bytes copy) = 206.8 MB/s / 212.5 MB/s
memcpy_arm : (15 bytes copy) = 204.0 MB/s / 209.6 MB/s
memcpy_neon : (15 bytes copy) = 247.5 MB/s / 270.3 MB/s
memcpy_arm : (16 bytes copy) = 212.2 MB/s / 225.6 MB/s
memcpy_neon : (16 bytes copy) = 175.3 MB/s / 252.2 MB/s
memcpy_arm : (24 bytes copy) = 274.6 MB/s / 326.5 MB/s
memcpy_neon : (24 bytes copy) = 244.7 MB/s / 367.8 MB/s
memcpy_arm : (31 bytes copy) = 333.3 MB/s / 399.2 MB/s
memcpy_neon : (31 bytes copy) = 304.3 MB/s / 463.5 MB/s
L1 cached data:
memcpy_arm : (4096 bytes copy) = 1295.5 MB/s / 2691.8 MB/s
memcpy_neon : (4096 bytes copy) = 1826.3 MB/s / 2021.8 MB/s
memcpy_arm : (6144 bytes copy) = 1306.5 MB/s / 2724.1 MB/s
memcpy_neon : (6144 bytes copy) = 1857.8 MB/s / 2053.2 MB/s
L2 cached data:
memcpy_arm : (65536 bytes copy) = 1291.5 MB/s / 2304.8 MB/s
memcpy_neon : (65536 bytes copy) = 1866.5 MB/s / 2441.7 MB/s
memcpy_arm : (98304 bytes copy) = 1285.6 MB/s / 2283.8 MB/s
memcpy_neon : (98304 bytes copy) = 1860.7 MB/s / 2454.7 MB/s
SDRAM:
memcpy_arm : (2097152 bytes copy) = 466.7 MB/s / 736.5 MB/s
memcpy_neon : (2097152 bytes copy) = 727.5 MB/s / 868.8 MB/s
memcpy_arm : (3145728 bytes copy) = 507.9 MB/s / 854.7 MB/s
memcpy_neon : (3145728 bytes copy) = 852.9 MB/s / 1038.0 MB/s
(*) 1 MB = 1000000 bytes
(*) 'memcpy_arm' - an implementation for older ARM cores from glibc-ports
The similar benchmark is at
http://sourceware.org/ml/libc-ports/2009-07/msg00000.html .
Regards,
$4
On Thu, Apr 4, 2013 at 12:19 AM, Ondřej Bílka <neleai@seznam.cz> wrote:
> On Wed, Apr 03, 2013 at 11:47:36PM +0800, Shih-Yuan Lee (FourDollars) wrote:
>> Hi Joseph,
>>
> ...
>> > I was previously told by people at ARM that NEON memcpy wasn't a good idea
>> > in practice because of raised power consumption, context switch costs etc.
>> > from using NEON in processes that otherwise didn't use it, even if it
>> > appeared superficially beneficial in benchmarks.
>> >
>> About raised power consumption and context switch costs, I may be able
>> to add some option in configure for the users to decide if they want
>> to use this feature or not.
>> How do you think?
>>
> Configure option is bit overkill.
>
> You need to compare neon/other implementation speed. Then determine
> size where neon is faster if we include energy cost and context switch.
> My first estimate is use neon when larger than 4096 bytes.
>
> However to determine context switch cost of neon you must account network effect.
>
> If you use neon in one function that is called sufficiently often (to
> always save registers) then adding neon implementation for additional functions
> does not increase cost.
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [Patches] [PATCH] ARM: NEON detected memcpy.
2013-04-04 4:15 ` Shih-Yuan Lee (FourDollars)
@ 2013-04-04 6:37 ` Ondřej Bílka
2013-04-08 9:12 ` Will Newton
2013-04-09 8:45 ` Richard Earnshaw
1 sibling, 1 reply; 19+ messages in thread
From: Ondřej Bílka @ 2013-04-04 6:37 UTC (permalink / raw)
To: Shih-Yuan Lee (FourDollars)
Cc: Joseph S. Myers, libc-ports, Jesse Sung, patches, YC Cheng, rex.tsai
On Thu, Apr 04, 2013 at 12:15:17PM +0800, Shih-Yuan Lee (FourDollars) wrote:
> Hi Ondrej,
>
> I do have some benchmark data.
>
Hi,
Try also benchmark with real world data (20MB). I put it on
http://kam.mff.cuni.cz/~ondra/dryrun_memcpy.tar.bz2
To add neon copy test_generic.c file and add compiling neon
implementation to benchmark script.
It now only measures total time.
I would need something like timestamp counter for more detailed results.
> --- Running benchmarks (average case/perfect alignment case) ---
>
> very small data test:
> memcpy_arm : (3 bytes copy) = 86.2 MB/s / 88.3 MB/s
> memcpy_neon : (3 bytes copy) = 53.4 MB/s / 54.5 MB/s
> memcpy_arm : (4 bytes copy) = 79.8 MB/s / 62.9 MB/s
> memcpy_neon : (4 bytes copy) = 72.5 MB/s / 73.9 MB/s
> memcpy_arm : (5 bytes copy) = 91.0 MB/s / 78.7 MB/s
> memcpy_neon : (5 bytes copy) = 90.2 MB/s / 91.0 MB/s
> memcpy_arm : (7 bytes copy) = 109.5 MB/s / 104.7 MB/s
> memcpy_neon : (7 bytes copy) = 122.1 MB/s / 126.6 MB/s
> memcpy_arm : (8 bytes copy) = 122.4 MB/s / 122.4 MB/s
> memcpy_neon : (8 bytes copy) = 142.0 MB/s / 148.2 MB/s
> memcpy_arm : (11 bytes copy) = 157.8 MB/s / 161.3 MB/s
> memcpy_neon : (11 bytes copy) = 193.8 MB/s / 196.2 MB/s
> memcpy_arm : (12 bytes copy) = 170.1 MB/s / 172.7 MB/s
> memcpy_neon : (12 bytes copy) = 206.8 MB/s / 212.5 MB/s
> memcpy_arm : (15 bytes copy) = 204.0 MB/s / 209.6 MB/s
> memcpy_neon : (15 bytes copy) = 247.5 MB/s / 270.3 MB/s
> memcpy_arm : (16 bytes copy) = 212.2 MB/s / 225.6 MB/s
> memcpy_neon : (16 bytes copy) = 175.3 MB/s / 252.2 MB/s
> memcpy_arm : (24 bytes copy) = 274.6 MB/s / 326.5 MB/s
> memcpy_neon : (24 bytes copy) = 244.7 MB/s / 367.8 MB/s
> memcpy_arm : (31 bytes copy) = 333.3 MB/s / 399.2 MB/s
> memcpy_neon : (31 bytes copy) = 304.3 MB/s / 463.5 MB/s
>
> L1 cached data:
> memcpy_arm : (4096 bytes copy) = 1295.5 MB/s / 2691.8 MB/s
> memcpy_neon : (4096 bytes copy) = 1826.3 MB/s / 2021.8 MB/s
> memcpy_arm : (6144 bytes copy) = 1306.5 MB/s / 2724.1 MB/s
> memcpy_neon : (6144 bytes copy) = 1857.8 MB/s / 2053.2 MB/s
>
> L2 cached data:
> memcpy_arm : (65536 bytes copy) = 1291.5 MB/s / 2304.8 MB/s
> memcpy_neon : (65536 bytes copy) = 1866.5 MB/s / 2441.7 MB/s
> memcpy_arm : (98304 bytes copy) = 1285.6 MB/s / 2283.8 MB/s
> memcpy_neon : (98304 bytes copy) = 1860.7 MB/s / 2454.7 MB/s
>
> SDRAM:
> memcpy_arm : (2097152 bytes copy) = 466.7 MB/s / 736.5 MB/s
> memcpy_neon : (2097152 bytes copy) = 727.5 MB/s / 868.8 MB/s
> memcpy_arm : (3145728 bytes copy) = 507.9 MB/s / 854.7 MB/s
> memcpy_neon : (3145728 bytes copy) = 852.9 MB/s / 1038.0 MB/s
>
> (*) 1 MB = 1000000 bytes
> (*) 'memcpy_arm' - an implementation for older ARM cores from glibc-ports
>
> The similar benchmark is at
> http://sourceware.org/ml/libc-ports/2009-07/msg00000.html .
>
> Regards,
> $4
>
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [Patches] [PATCH] ARM: NEON detected memcpy.
2013-04-04 6:37 ` Ondřej Bílka
@ 2013-04-08 9:12 ` Will Newton
2013-04-08 10:27 ` Ondřej Bílka
0 siblings, 1 reply; 19+ messages in thread
From: Will Newton @ 2013-04-08 9:12 UTC (permalink / raw)
To: Ondřej Bílka
Cc: Shih-Yuan Lee (FourDollars),
Joseph S. Myers, libc-ports, Jesse Sung, patches, YC Cheng,
rex.tsai
On 4 April 2013 07:37, Ondřej Bílka <neleai@seznam.cz> wrote:
> On Thu, Apr 04, 2013 at 12:15:17PM +0800, Shih-Yuan Lee (FourDollars) wrote:
>> Hi Ondrej,
>>
>> I do have some benchmark data.
>>
> Hi,
>
> Try also benchmark with real world data (20MB). I put it on
> http://kam.mff.cuni.cz/~ondra/dryrun_memcpy.tar.bz2
Hi Ondrej,
How was the workload chosen for this test run? Is it a known "memcpy
hot" workload?
Also it looks like the data was captured on x86_64? I suspect we
should use a specific data set for each architecture - the alignment
of data will change depending on the ABI alignment rules and different
compilers inline e.g. constant sized memcpys in different ways. Last
time I looked gcc seemed to be much more aggressive with inlining
string functions on x86 than arm for example.
Thanks,
--
Will Newton
Toolchain Working Group, Linaro
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [Patches] [PATCH] ARM: NEON detected memcpy.
2013-04-08 9:12 ` Will Newton
@ 2013-04-08 10:27 ` Ondřej Bílka
0 siblings, 0 replies; 19+ messages in thread
From: Ondřej Bílka @ 2013-04-08 10:27 UTC (permalink / raw)
To: Will Newton
Cc: Shih-Yuan Lee (FourDollars),
Joseph S. Myers, libc-ports, Jesse Sung, patches, YC Cheng,
rex.tsai
On Mon, Apr 08, 2013 at 10:11:59AM +0100, Will Newton wrote:
> On 4 April 2013 07:37, Ondřej Bílka <neleai@seznam.cz> wrote:
> > On Thu, Apr 04, 2013 at 12:15:17PM +0800, Shih-Yuan Lee (FourDollars) wrote:
> >> Hi Ondrej,
> >>
> >> I do have some benchmark data.
> >>
> > Hi,
> >
> > Try also benchmark with real world data (20MB). I put it on
> > http://kam.mff.cuni.cz/~ondra/dryrun_memcpy.tar.bz2
>
> Hi Ondrej,
>
> How was the workload chosen for this test run? Is it a known "memcpy
> hot" workload?
>
Collected during a day of normal usage.
The majority of memcpy calls are hot; see how the delays between calls are distributed in:
http://kam.mff.cuni.cz/~ondra/benchmark_string/profile/result.html
There, more than 95% of calls come less than 2^15 = 32768 cycles after the previous
call.
> Also it looks like the data was captured on x86_64? I suspect we
yes.
> should use a specific data set for each architecture - the alignment
> of data will change depending on the ABI alignment rules and different
> compilers inline e.g. constant sized memcpys in different ways. Last
> time I looked gcc seemed to be much more aggressive with inlining
> string functions on x86 than arm for example.
>
If you want to capture data for ARM, do the following:
rm record.rec # Otherwise you would append to x64 data.
make
# I did not test this on ARM, so record, for example, a make run or anything else of interest.
LD_PRELOAD=./record.so make
# Then see if data are really recorded
./show #displays alignment and lengths of recorded data.
# Finally, you can enable recording globally by
echo $PWD/record.so >> /etc/ld.so.preload
> Thanks,
>
> --
> Will Newton
> Toolchain Working Group, Linaro
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [Patches] [PATCH] ARM: NEON detected memcpy.
2013-04-04 4:15 ` Shih-Yuan Lee (FourDollars)
2013-04-04 6:37 ` Ondřej Bílka
@ 2013-04-09 8:45 ` Richard Earnshaw
1 sibling, 0 replies; 19+ messages in thread
From: Richard Earnshaw @ 2013-04-09 8:45 UTC (permalink / raw)
To: Shih-Yuan Lee (FourDollars)
Cc: Ondřej Bílka, Joseph S. Myers, libc-ports, Jesse Sung,
patches, YC Cheng, rex.tsai
On 04/04/13 05:15, Shih-Yuan Lee (FourDollars) wrote:
> Hi Ondrej,
>
> I do have some benchmark data.
>
> --- Running benchmarks (average case/perfect alignment case) ---
>
> very small data test:
> memcpy_arm : (3 bytes copy) = 86.2 MB/s / 88.3 MB/s
> memcpy_neon : (3 bytes copy) = 53.4 MB/s / 54.5 MB/s
> memcpy_arm : (4 bytes copy) = 79.8 MB/s / 62.9 MB/s
> memcpy_neon : (4 bytes copy) = 72.5 MB/s / 73.9 MB/s
> memcpy_arm : (5 bytes copy) = 91.0 MB/s / 78.7 MB/s
> memcpy_neon : (5 bytes copy) = 90.2 MB/s / 91.0 MB/s
> memcpy_arm : (7 bytes copy) = 109.5 MB/s / 104.7 MB/s
> memcpy_neon : (7 bytes copy) = 122.1 MB/s / 126.6 MB/s
> memcpy_arm : (8 bytes copy) = 122.4 MB/s / 122.4 MB/s
> memcpy_neon : (8 bytes copy) = 142.0 MB/s / 148.2 MB/s
> memcpy_arm : (11 bytes copy) = 157.8 MB/s / 161.3 MB/s
> memcpy_neon : (11 bytes copy) = 193.8 MB/s / 196.2 MB/s
> memcpy_arm : (12 bytes copy) = 170.1 MB/s / 172.7 MB/s
> memcpy_neon : (12 bytes copy) = 206.8 MB/s / 212.5 MB/s
> memcpy_arm : (15 bytes copy) = 204.0 MB/s / 209.6 MB/s
> memcpy_neon : (15 bytes copy) = 247.5 MB/s / 270.3 MB/s
> memcpy_arm : (16 bytes copy) = 212.2 MB/s / 225.6 MB/s
> memcpy_neon : (16 bytes copy) = 175.3 MB/s / 252.2 MB/s
> memcpy_arm : (24 bytes copy) = 274.6 MB/s / 326.5 MB/s
> memcpy_neon : (24 bytes copy) = 244.7 MB/s / 367.8 MB/s
> memcpy_arm : (31 bytes copy) = 333.3 MB/s / 399.2 MB/s
> memcpy_neon : (31 bytes copy) = 304.3 MB/s / 463.5 MB/s
>
> L1 cached data:
> memcpy_arm : (4096 bytes copy) = 1295.5 MB/s / 2691.8 MB/s
> memcpy_neon : (4096 bytes copy) = 1826.3 MB/s / 2021.8 MB/s
> memcpy_arm : (6144 bytes copy) = 1306.5 MB/s / 2724.1 MB/s
> memcpy_neon : (6144 bytes copy) = 1857.8 MB/s / 2053.2 MB/s
>
> L2 cached data:
> memcpy_arm : (65536 bytes copy) = 1291.5 MB/s / 2304.8 MB/s
> memcpy_neon : (65536 bytes copy) = 1866.5 MB/s / 2441.7 MB/s
> memcpy_arm : (98304 bytes copy) = 1285.6 MB/s / 2283.8 MB/s
> memcpy_neon : (98304 bytes copy) = 1860.7 MB/s / 2454.7 MB/s
>
> SDRAM:
> memcpy_arm : (2097152 bytes copy) = 466.7 MB/s / 736.5 MB/s
> memcpy_neon : (2097152 bytes copy) = 727.5 MB/s / 868.8 MB/s
> memcpy_arm : (3145728 bytes copy) = 507.9 MB/s / 854.7 MB/s
> memcpy_neon : (3145728 bytes copy) = 852.9 MB/s / 1038.0 MB/s
>
> (*) 1 MB = 1000000 bytes
> (*) 'memcpy_arm' - an implementation for older ARM cores from glibc-ports
You don't say what this is measured on. Without knowing the hardware
it's impossible to really argue whether this is generally a good thing
or not.
R.
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH] ARM: NEON detected memcpy.
2013-04-03 15:08 ` Joseph S. Myers
2013-04-03 15:48 ` Shih-Yuan Lee (FourDollars)
@ 2013-04-09 9:05 ` Richard Earnshaw
2013-04-09 12:04 ` Ondřej Bílka
2013-04-09 12:59 ` Carlos O'Donell
1 sibling, 2 replies; 19+ messages in thread
From: Richard Earnshaw @ 2013-04-09 9:05 UTC (permalink / raw)
To: Joseph S. Myers
Cc: Shih-Yuan Lee (FourDollars),
patches, libc-ports, rex.tsai, jesse.sung, yc.cheng,
Shih-Yuan Lee
On 03/04/13 16:08, Joseph S. Myers wrote:
> I was previously told by people at ARM that NEON memcpy wasn't a good idea
> in practice because of raised power consumption, context switch costs etc.
> from using NEON in processes that otherwise didn't use it, even if it
> appeared superficially beneficial in benchmarks.
What really matters is system power increase vs performance gain and
what you might be able to save if you finish sooner. If a 10%
improvement to memcpy performance comes at a 12% increase in CPU power,
then that might seem like a net loss. But if the CPU is only 50% of the
system power, then the increase in system power is just half of
that (i.e. 6%), but the performance improvement will still be 10%. Note
that 50% is just an example to make the figures easier here; I've no
idea what the real numbers are, and they will be highly dependent on
the other components in the system: a back-lit display, in particular,
will use a significant amount of power.
It's also necessary to think about how the Neon unit in the processor is
managed: is it power gated or simply clock gated? Power gated regions
are likely to have long power-up times (relative to normal CPU
operations), but clock-gated regions are typically instantaneously
available.
Finally, you need to consider whether the unit is likely to be already
in use. With the increasing trend to using the hard-float ABI, VFP (and
Neon) are generally much more widely used in code now than they were, so
the other potential cost of using Neon (lazy context switching) is also
more likely to be a non-issue than it would be if the unit were almost never touched.
R.
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH] ARM: NEON detected memcpy.
2013-04-09 9:05 ` Richard Earnshaw
@ 2013-04-09 12:04 ` Ondřej Bílka
2013-04-09 12:59 ` Carlos O'Donell
1 sibling, 0 replies; 19+ messages in thread
From: Ondřej Bílka @ 2013-04-09 12:04 UTC (permalink / raw)
To: Richard Earnshaw
Cc: Joseph S. Myers, Shih-Yuan Lee (FourDollars),
patches, libc-ports, rex.tsai, jesse.sung, yc.cheng,
Shih-Yuan Lee
On Tue, Apr 09, 2013 at 10:04:56AM +0100, Richard Earnshaw wrote:
> On 03/04/13 16:08, Joseph S. Myers wrote:
> >I was previously told by people at ARM that NEON memcpy wasn't a good idea
> >in practice because of raised power consumption, context switch costs etc.
> >from using NEON in processes that otherwise didn't use it, even if it
> >appeared superficially beneficial in benchmarks.
>
> What really matters is system power increase vs performance gain and
> what you might be able to save if you finish sooner. If a 10%
> improvement to memcpy performance comes at a 12% increase in CPU
> power, then that might seem like a net loss. But if the CPU is only
> 50% of the system power, then the increase in system power increase
> is just half of that (ie 6%), but the performance improvement will
> still be 10%. Note that 20% is just an example to make the figures
> easier here, I've no idea what the real numbers are, and they will
> be hightly dependent on the other components in the system: a
> back-lit display, in particular, will use a significant amount of
> power.
>
I said a similar thing. I also added a threshold idea.
From my previous mail:
"
You need to compare the speed of the NEON implementation with the other one. Then determine
the size at which NEON is faster once energy cost and context-switch cost are included.
My first estimate is to use NEON for copies larger than 4096 bytes.
However, to determine the context-switch cost of NEON you must account for the
network effect.
If you use NEON in one function that is called sufficiently often (so that the registers
are always saved), then adding NEON implementations of additional
functions
does not increase the cost.
"
Ondra
> It's also necessary to think about how the Neon unit in the
> processor is managed. Is it power gated or simply clock gated.
> Power gated regions are likely to have long power-up times (relative
> to normal CPU operations), but clock-gated regions are typically
> instantaneously available.
>
> Finally, you need to consider whether the unit is likely to be
> already in use. With the increasing trend to using the hard-float
> ABI, VFP (and Neon) are generally much more widely used in code now
> than they were, so the other potential cost of using Neon (lazy
> context switching) is also likely to be a non-issue, than if the
> unit is almost never touched.
>
memcpy is, after strcmp, the second most often called string function.
> R.
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH] ARM: NEON detected memcpy.
2013-04-09 9:05 ` Richard Earnshaw
2013-04-09 12:04 ` Ondřej Bílka
@ 2013-04-09 12:59 ` Carlos O'Donell
2013-04-09 15:00 ` Richard Earnshaw
1 sibling, 1 reply; 19+ messages in thread
From: Carlos O'Donell @ 2013-04-09 12:59 UTC (permalink / raw)
To: Richard Earnshaw
Cc: Joseph S. Myers, Shih-Yuan Lee (FourDollars),
patches, libc-ports, rex.tsai, jesse.sung, yc.cheng,
Shih-Yuan Lee
On 04/09/2013 05:04 AM, Richard Earnshaw wrote:
> On 03/04/13 16:08, Joseph S. Myers wrote:
>> I was previously told by people at ARM that NEON memcpy wasn't a good idea
>> in practice because of raised power consumption, context switch costs etc.
>> from using NEON in processes that otherwise didn't use it, even if it
>> appeared superficially beneficial in benchmarks.
>
> What really matters is system power increase vs performance gain and
> what you might be able to save if you finish sooner. If a 10%
> improvement to memcpy performance comes at a 12% increase in CPU
> power, then that might seem like a net loss. But if the CPU is only
> 50% of the system power, then the increase in system power increase
> is just half of that (ie 6%), but the performance improvement will
> still be 10%. Note that 20% is just an example to make the figures
> easier here, I've no idea what the real numbers are, and they will be
> hightly dependent on the other components in the system: a back-lit
> display, in particular, will use a significant amount of power.
>
> It's also necessary to think about how the Neon unit in the processor
> is managed. Is it power gated or simply clock gated. Power gated
> regions are likely to have long power-up times (relative to normal
> CPU operations), but clock-gated regions are typically
> instantaneously available.
>
> Finally, you need to consider whether the unit is likely to be
> already in use. With the increasing trend to using the hard-float
> ABI, VFP (and Neon) are generally much more widely used in code now
> than they were, so the other potential cost of using Neon (lazy
> context switching) is also likely to be a non-issue, than if the unit
> is almost never touched.
My expectation here is that downstream integrators run the
glibc microbenchmarks, or their own benchmarks, measure power,
and engage the community to discuss alternate runtime tunings
for their systems.
The project lacks any generalized whole-system benchmarking,
but my opinion is that microbenchmarks are the best "first step"
towards achieving measurable performance goals (since whole-system
benchmarking is much more complicated).
At present the only policy we have as a community is that faster
is always better.
Cheers,
Carlos.
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH] ARM: NEON detected memcpy.
2013-04-09 12:59 ` Carlos O'Donell
@ 2013-04-09 15:00 ` Richard Earnshaw
2013-04-09 15:54 ` Ondřej Bílka
2013-04-09 15:59 ` Carlos O'Donell
0 siblings, 2 replies; 19+ messages in thread
From: Richard Earnshaw @ 2013-04-09 15:00 UTC (permalink / raw)
To: Carlos O'Donell
Cc: Joseph S. Myers, Shih-Yuan Lee (FourDollars),
patches, libc-ports, rex.tsai, jesse.sung, yc.cheng,
Shih-Yuan Lee
On 09/04/13 13:58, Carlos O'Donell wrote:
> On 04/09/2013 05:04 AM, Richard Earnshaw wrote:
>> On 03/04/13 16:08, Joseph S. Myers wrote:
>>> I was previously told by people at ARM that NEON memcpy wasn't a good idea
>>> in practice because of raised power consumption, context switch costs etc.
>>> from using NEON in processes that otherwise didn't use it, even if it
>>> appeared superficially beneficial in benchmarks.
>>
>> What really matters is system power increase vs performance gain and
>> what you might be able to save if you finish sooner. If a 10%
>> improvement to memcpy performance comes at a 12% increase in CPU
>> power, then that might seem like a net loss. But if the CPU is only
>> 50% of the system power, then the increase in system power increase
>> is just half of that (ie 6%), but the performance improvement will
>> still be 10%. Note that 20% is just an example to make the figures
>> easier here, I've no idea what the real numbers are, and they will be
>> hightly dependent on the other components in the system: a back-lit
>> display, in particular, will use a significant amount of power.
>>
>> It's also necessary to think about how the Neon unit in the processor
>> is managed. Is it power gated or simply clock gated. Power gated
>> regions are likely to have long power-up times (relative to normal
>> CPU operations), but clock-gated regions are typically
>> instantaneously available.
>>
>> Finally, you need to consider whether the unit is likely to be
>> already in use. With the increasing trend to using the hard-float
>> ABI, VFP (and Neon) are generally much more widely used in code now
>> than they were, so the other potential cost of using Neon (lazy
>> context switching) is also likely to be a non-issue, than if the unit
>> is almost never touched.
>
> My expectation here is that downstream integrators run the
> glibc microbenchmarks, or their own benchmarks, measure power,
> and engage the community to discuss alternate runtime tunings
> for their systems.
>
> The project lacks any generalized whole-system benchmarking,
> but my opinion is that microbenchmarks are the best "first step"
> towards achieving measurable performance goals (since whole-system
> benchmarking is much more complicated).
>
> At present the only policy we have as a community is that faster
> is always better.
You still have to be careful how you measure 'faster'. Repeatedly
running the same fragment of code under the same boundary conditions
will only ever give you the 'warm caches' number (I, D and branch
target), but if the code is called cold (or with different boundary
conditions in the case of the Branch target cache) most of the time in
real life, that's unlikely to be very meaningful.
R.
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH] ARM: NEON detected memcpy.
2013-04-09 15:00 ` Richard Earnshaw
@ 2013-04-09 15:54 ` Ondřej Bílka
2013-04-09 15:59 ` Carlos O'Donell
1 sibling, 0 replies; 19+ messages in thread
From: Ondřej Bílka @ 2013-04-09 15:54 UTC (permalink / raw)
To: Richard Earnshaw
Cc: Carlos O'Donell, Joseph S. Myers, Shih-Yuan Lee (FourDollars),
patches, libc-ports, rex.tsai, jesse.sung, yc.cheng,
Shih-Yuan Lee
On Tue, Apr 09, 2013 at 04:00:03PM +0100, Richard Earnshaw wrote:
> On 09/04/13 13:58, Carlos O'Donell wrote:
> >On 04/09/2013 05:04 AM, Richard Earnshaw wrote:
> >>On 03/04/13 16:08, Joseph S. Myers wrote:
> >>>I was previously told by people at ARM that NEON memcpy wasn't a good idea
> >>>in practice because of raised power consumption, context switch costs etc.
> >>>from using NEON in processes that otherwise didn't use it, even if it
> >>>appeared superficially beneficial in benchmarks.
> >>
> >>What really matters is system power increase vs performance gain and
> >>what you might be able to save if you finish sooner. If a 10%
> >>improvement to memcpy performance comes at a 12% increase in CPU
> >>power, then that might seem like a net loss. But if the CPU is only
> >>50% of the system power, then the increase in system power increase
> >>is just half of that (ie 6%), but the performance improvement will
> >>still be 10%. Note that 20% is just an example to make the figures
> >>easier here, I've no idea what the real numbers are, and they will be
> >>hightly dependent on the other components in the system: a back-lit
> >>display, in particular, will use a significant amount of power.
> >>
> >>It's also necessary to think about how the Neon unit in the processor
> >>is managed. Is it power gated or simply clock gated. Power gated
> >>regions are likely to have long power-up times (relative to normal
> >>CPU operations), but clock-gated regions are typically
> >>instantaneously available.
> >>
> >>Finally, you need to consider whether the unit is likely to be
> >>already in use. With the increasing trend to using the hard-float
> >>ABI, VFP (and Neon) are generally much more widely used in code now
> >>than they were, so the other potential cost of using Neon (lazy
> >>context switching) is also likely to be a non-issue, than if the unit
> >>is almost never touched.
> >
> >My expectation here is that downstream integrators run the
> >glibc microbenchmarks, or their own benchmarks, measure power,
> >and engage the community to discuss alternate runtime tunings
> >for their systems.
> >
> >The project lacks any generalized whole-system benchmarking,
> >but my opinion is that microbenchmarks are the best "first step"
> >towards achieving measurable performance goals (since whole-system
> >benchmarking is much more complicated).
> >
> >At present the only policy we have as a community is that faster
> >is always better.
>
I am rewriting my whole-system benchmarks to be more generic.
Still, measuring performance would be time consuming; the benchmarks need
at least an hour to get enough data.
Also, I cannot replicate the exact conditions of a measurement. They depend on
what I do with the computer, which varies.
There is also a problem with representativeness. I know the conditions for
popular programs (gcc, firefox). Most other programs show very similar
characteristics, but I do not know anything about the tail.
To get more direct feedback I also do a record/replay benchmark; see my
previous mail.
>
> You still have to be careful how you measure 'faster'. Repeatedly
> running the same fragment of code under the same boundary conditions
> will only ever give you the 'warm caches' number (I, D and branch
> target), but if the code is called cold (or with different boundary
> conditions in the case of the Branch target cache) most of the time
> in real life, that's unlikely to be very meaningful.
>
> R.
>
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH] ARM: NEON detected memcpy.
2013-04-09 15:00 ` Richard Earnshaw
2013-04-09 15:54 ` Ondřej Bílka
@ 2013-04-09 15:59 ` Carlos O'Donell
1 sibling, 0 replies; 19+ messages in thread
From: Carlos O'Donell @ 2013-04-09 15:59 UTC (permalink / raw)
To: Richard Earnshaw
Cc: Joseph S. Myers, Shih-Yuan Lee (FourDollars),
patches, libc-ports, rex.tsai, jesse.sung, yc.cheng,
Shih-Yuan Lee
On 04/09/2013 11:00 AM, Richard Earnshaw wrote:
>> At present the only policy we have as a community is that faster is
>> always better.
>
>
> You still have to be careful how you measure 'faster'. Repeatedly
> running the same fragment of code under the same boundary conditions
> will only ever give you the 'warm caches' number (I, D and branch
> target), but if the code is called cold (or with different boundary
> conditions in the case of the Branch target cache) most of the time
> in real life, that's unlikely to be very meaningful.
Agreed, but that's what whole system benchmarking is for.
We can't solve all problems at once, and we had zero benchmarking
before we started this work for 2.18.
Hopefully by 2.20 or 2.21 we have some kind of whole system
benchmarking that allows users to monitor their system, gather
data, and submit it back to the project for analysis.
Cheers,
Carlos.
^ permalink raw reply [flat|nested] 19+ messages in thread
end of thread, other threads:[~2013-04-09 15:59 UTC | newest]
Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-04-03 7:58 [PATCH] ARM: NEON detected memcpy Shih-Yuan Lee (FourDollars)
2013-04-03 8:15 ` Will Newton
2013-04-03 9:19 ` Ondřej Bílka
2013-04-03 15:08 ` Joseph S. Myers
2013-04-03 15:48 ` Shih-Yuan Lee (FourDollars)
2013-04-03 16:02 ` Joseph S. Myers
2013-04-04 3:56 ` Shih-Yuan Lee (FourDollars)
2013-04-03 16:20 ` [Patches] " Ondřej Bílka
2013-04-04 4:15 ` Shih-Yuan Lee (FourDollars)
2013-04-04 6:37 ` Ondřej Bílka
2013-04-08 9:12 ` Will Newton
2013-04-08 10:27 ` Ondřej Bílka
2013-04-09 8:45 ` Richard Earnshaw
2013-04-09 9:05 ` Richard Earnshaw
2013-04-09 12:04 ` Ondřej Bílka
2013-04-09 12:59 ` Carlos O'Donell
2013-04-09 15:00 ` Richard Earnshaw
2013-04-09 15:54 ` Ondřej Bílka
2013-04-09 15:59 ` Carlos O'Donell
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).