public inbox for libc-alpha@sourceware.org
 help / color / mirror / Atom feed
* [Patch] aarch64: Thunderx specific memcpy and memmove
@ 2017-03-24 23:25 Steve Ellcey
  2017-03-27 10:45 ` Szabolcs Nagy
                   ` (3 more replies)
  0 siblings, 4 replies; 7+ messages in thread
From: Steve Ellcey @ 2017-03-24 23:25 UTC (permalink / raw)
  To: libc-alpha; +Cc: Siddhesh Poyarekar, Adhemerval Zanella

[-- Attachment #1: Type: text/plain, Size: 1848 bytes --]

Now that the IFUNC infrastructure for aarch64 is in place, here is a
patch to use it to create ThunderX specific versions of memcpy and
memmove.

This was part of my original patch before it was split in two and a
couple of issues were raised at that time. 

Siddhesh Poyarekar wanted to separate the generic and thunderx copies
of memcpy/memmove instead of using ifdefs in a combined source file.
I prefer the ifdef version as a cleaner implementation with less code
duplication but I can change it if that is the consensus.

Also Adhemerval Zanella did some benchmarking that showed the
prefetching done in the thunderx version might be appropriate for the
generic version.  However if you look at the prefetching we only do it
every other time through the loop.  This is because the loop copies 64
bytes and the ThunderX cache line size is 128 bytes.  If other aarch64
chips have a 64 byte cache line they might want a different prefetching
setup.

If people think we should use the ThunderX version of memcpy for all
aarch64 systems I am happy to drop this patch and create one that just
changes memcpy.S to do the ThunderX style prefetches for all aarch64
systems.

Steve Ellcey
sellcey@cavium.com


2017-03-24  Steve Ellcey  <sellcey@caviumnetworks.com>

	* sysdeps/aarch64/memcpy.S (MEMMOVE, MEMCPY): New macros.
	(memmove): Use MEMMOVE for name.
	(memcpy): Use MEMCPY for name.  Add loop with prefetching
	under USE_THUNDERX macro.
	* sysdeps/aarch64/multiarch/Makefile: New file.
	* sysdeps/aarch64/multiarch/ifunc-impl-list.c: Likewise.
	* sysdeps/aarch64/multiarch/init-arch.h: Likewise.
	* sysdeps/aarch64/multiarch/memcpy.c: Likewise.
	* sysdeps/aarch64/multiarch/memcpy_generic.S: Likewise.
	* sysdeps/aarch64/multiarch/memcpy_thunderx.S: Likewise.
	* sysdeps/aarch64/multiarch/memmove.c: Likewise.

[-- Attachment #2: ifunc.patch --]
[-- Type: text/x-patch, Size: 13342 bytes --]

diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S
index 29af8b1..74444b4 100644
--- a/sysdeps/aarch64/memcpy.S
+++ b/sysdeps/aarch64/memcpy.S
@@ -59,7 +59,14 @@
    Overlapping large forward memmoves use a loop that copies backwards.
 */
 
-ENTRY_ALIGN (memmove, 6)
+#ifndef MEMMOVE
+#  define MEMMOVE memmove
+#endif
+#ifndef MEMCPY
+#  define MEMCPY memcpy
+#endif
+
+ENTRY_ALIGN (MEMMOVE, 6)
 
 	DELOUSE (0)
 	DELOUSE (1)
@@ -71,9 +78,9 @@ ENTRY_ALIGN (memmove, 6)
 	b.lo	L(move_long)
 
 	/* Common case falls through into memcpy.  */
-END (memmove)
-libc_hidden_builtin_def (memmove)
-ENTRY (memcpy)
+END (MEMMOVE)
+libc_hidden_builtin_def (MEMMOVE)
+ENTRY (MEMCPY)
 
 	DELOUSE (0)
 	DELOUSE (1)
@@ -158,10 +165,22 @@ L(copy96):
 
 	.p2align 4
 L(copy_long):
+
+#ifdef USE_THUNDERX
+
+	/* On thunderx, large memcpy's are helped by software prefetching.
+	   This loop is identical to the one below it but with prefetching
+	   instructions included.  For loops that are less than 32768 bytes,
+	   the prefetching does not help and slow the code down so we only
+	   use the prefetching loop for the largest memcpys.  */
+
+	cmp	count, #32768
+	b.lo	L(copy_long_without_prefetch)
 	and	tmp1, dstin, 15
 	bic	dst, dstin, 15
 	ldp	D_l, D_h, [src]
 	sub	src, src, tmp1
+	prfm	pldl1strm, [src, 384]
 	add	count, count, tmp1	/* Count is now 16 too large.  */
 	ldp	A_l, A_h, [src, 16]
 	stp	D_l, D_h, [dstin]
@@ -169,7 +188,10 @@ L(copy_long):
 	ldp	C_l, C_h, [src, 48]
 	ldp	D_l, D_h, [src, 64]!
 	subs	count, count, 128 + 16	/* Test and readjust count.  */
-	b.ls	2f
+
+L(prefetch_loop64):
+	tbz	src, #6, 1f
+	prfm	pldl1strm, [src, 512]
 1:
 	stp	A_l, A_h, [dst, 16]
 	ldp	A_l, A_h, [src, 16]
@@ -180,12 +202,40 @@ L(copy_long):
 	stp	D_l, D_h, [dst, 64]!
 	ldp	D_l, D_h, [src, 64]!
 	subs	count, count, 64
-	b.hi	1b
+	b.hi	L(prefetch_loop64)
+	b	L(last64)
+
+L(copy_long_without_prefetch):
+#endif
+
+	and	tmp1, dstin, 15
+	bic	dst, dstin, 15
+	ldp	D_l, D_h, [src]
+	sub	src, src, tmp1
+	add	count, count, tmp1	/* Count is now 16 too large.  */
+	ldp	A_l, A_h, [src, 16]
+	stp	D_l, D_h, [dstin]
+	ldp	B_l, B_h, [src, 32]
+	ldp	C_l, C_h, [src, 48]
+	ldp	D_l, D_h, [src, 64]!
+	subs	count, count, 128 + 16	/* Test and readjust count.  */
+	b.ls	L(last64)
+L(loop64):
+	stp	A_l, A_h, [dst, 16]
+	ldp	A_l, A_h, [src, 16]
+	stp	B_l, B_h, [dst, 32]
+	ldp	B_l, B_h, [src, 32]
+	stp	C_l, C_h, [dst, 48]
+	ldp	C_l, C_h, [src, 48]
+	stp	D_l, D_h, [dst, 64]!
+	ldp	D_l, D_h, [src, 64]!
+	subs	count, count, 64
+	b.hi	L(loop64)
 
 	/* Write the last full set of 64 bytes.  The remainder is at most 64
 	   bytes, so it is safe to always copy 64 bytes from the end even if
 	   there is just 1 byte left.  */
-2:
+L(last64):
 	ldp	E_l, E_h, [srcend, -64]
 	stp	A_l, A_h, [dst, 16]
 	ldp	A_l, A_h, [srcend, -48]
@@ -256,5 +306,5 @@ L(move_long):
 	stp	C_l, C_h, [dstin]
 3:	ret
 
-END (memcpy)
-libc_hidden_builtin_def (memcpy)
+END (MEMCPY)
+libc_hidden_builtin_def (MEMCPY)
diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index e69de29..78d52c7 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -0,0 +1,3 @@
+ifeq ($(subdir),string)
+sysdep_routines += memcpy_generic memcpy_thunderx
+endif
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index e69de29..c4f23df 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -0,0 +1,51 @@
+/* Enumerate available IFUNC implementations of a function.  AARCH64 version.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <assert.h>
+#include <string.h>
+#include <wchar.h>
+#include <ldsodefs.h>
+#include <ifunc-impl-list.h>
+#include <init-arch.h>
+#include <stdio.h>
+
+/* Maximum number of IFUNC implementations.  */
+#define MAX_IFUNC	2
+
+size_t
+__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+			size_t max)
+{
+  assert (max >= MAX_IFUNC);
+
+  size_t i = 0;
+
+  INIT_ARCH ();
+
+  /* Support sysdeps/aarch64/multiarch/memcpy.c and memmove.c.  */
+  IFUNC_IMPL (i, name, memcpy,
+	      IFUNC_IMPL_ADD (array, i, memcpy, IS_THUNDERX (midr),
+			      __memcpy_thunderx)
+	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
+  IFUNC_IMPL (i, name, memmove,
+	      IFUNC_IMPL_ADD (array, i, memmove, IS_THUNDERX (midr),
+			      __memmove_thunderx)
+	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
+
+  return i;
+}
diff --git a/sysdeps/aarch64/multiarch/init-arch.h b/sysdeps/aarch64/multiarch/init-arch.h
index e69de29..e690e00 100644
--- a/sysdeps/aarch64/multiarch/init-arch.h
+++ b/sysdeps/aarch64/multiarch/init-arch.h
@@ -0,0 +1,22 @@
+/* This file is part of the GNU C Library.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <ldsodefs.h>
+
+#define INIT_ARCH()				\
+  uint64_t __attribute__((unused)) midr =	\
+    GLRO(dl_aarch64_cpu_features).midr_el1;
diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c
index e69de29..4e3f251 100644
--- a/sysdeps/aarch64/multiarch/memcpy.c
+++ b/sysdeps/aarch64/multiarch/memcpy.c
@@ -0,0 +1,39 @@
+/* Multiple versions of memcpy. AARCH64 version.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+
+#if IS_IN (libc)
+/* Redefine memcpy so that the compiler won't complain about the type
+   mismatch with the IFUNC selector in strong_alias, below.  */
+# undef memcpy
+# define memcpy __redirect_memcpy
+# include <string.h>
+# include <init-arch.h>
+
+extern __typeof (__redirect_memcpy) __libc_memcpy;
+
+extern __typeof (__redirect_memcpy) __memcpy_generic attribute_hidden;
+extern __typeof (__redirect_memcpy) __memcpy_thunderx attribute_hidden;
+
+libc_ifunc (__libc_memcpy,
+            IS_THUNDERX (midr) ? __memcpy_thunderx : __memcpy_generic);
+
+#undef memcpy
+strong_alias (__libc_memcpy, memcpy);
+#endif
diff --git a/sysdeps/aarch64/multiarch/memcpy_generic.S b/sysdeps/aarch64/multiarch/memcpy_generic.S
index e69de29..50e1a1c 100644
--- a/sysdeps/aarch64/multiarch/memcpy_generic.S
+++ b/sysdeps/aarch64/multiarch/memcpy_generic.S
@@ -0,0 +1,42 @@
+/* A Generic Optimized memcpy implementation for AARCH64.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* The actual memcpy and memmove code is in ../memcpy.S.  If we are
+   building libc this file defines __memcpy_generic and __memmove_generic.
+   Otherwise the include of ../memcpy.S will define the normal __memcpy
+   and__memmove entry points.  */
+
+#include <sysdep.h>
+
+#if IS_IN (libc)
+
+#define MEMCPY __memcpy_generic
+#define MEMMOVE __memmove_generic
+
+/* Do not hide the generic versions of memcpy and memmove, we use them
+   internally.  */
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+/* It doesn't make sense to send libc-internal memcpy calls through a PLT. */
+	.globl __GI_memcpy; __GI_memcpy = __memcpy_generic
+	.globl __GI_memmove; __GI_memmove = __memmove_generic
+
+#endif
+
+#include "../memcpy.S"
diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx.S b/sysdeps/aarch64/multiarch/memcpy_thunderx.S
index e69de29..ee971c8 100644
--- a/sysdeps/aarch64/multiarch/memcpy_thunderx.S
+++ b/sysdeps/aarch64/multiarch/memcpy_thunderx.S
@@ -0,0 +1,32 @@
+/* A Thunderx Optimized memcpy implementation for AARCH64.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* The actual thunderx optimized code is in ../memcpy.S under the USE_THUNDERX
+   ifdef.  If we are not building libc then we do not build anything when
+   compiling this file and __memcpy is defined by memcpy_generic.S.  */
+
+#include <sysdep.h>
+
+#if IS_IN (libc)
+
+#define MEMCPY __memcpy_thunderx
+#define MEMMOVE __memmove_thunderx
+#define USE_THUNDERX
+#include "../memcpy.S"
+
+#endif
diff --git a/sysdeps/aarch64/multiarch/memmove.c b/sysdeps/aarch64/multiarch/memmove.c
index e69de29..8d7a146 100644
--- a/sysdeps/aarch64/multiarch/memmove.c
+++ b/sysdeps/aarch64/multiarch/memmove.c
@@ -0,0 +1,39 @@
+/* Multiple versions of memmove. AARCH64 version.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+
+#if IS_IN (libc)
+/* Redefine memmove so that the compiler won't complain about the type
+   mismatch with the IFUNC selector in strong_alias, below.  */
+# undef memmove
+# define memmove __redirect_memmove
+# include <string.h>
+# include <init-arch.h>
+
+extern __typeof (__redirect_memmove) __libc_memmove;
+
+extern __typeof (__redirect_memmove) __memmove_generic attribute_hidden;
+extern __typeof (__redirect_memmove) __memmove_thunderx attribute_hidden;
+
+libc_ifunc (__libc_memmove,
+            IS_THUNDERX (midr) ? __memmove_thunderx : __memmove_generic);
+
+#undef memmove
+strong_alias (__libc_memmove, memmove);
+#endif

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2017-05-02  3:51 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-03-24 23:25 [Patch] aarch64: Thunderx specific memcpy and memmove Steve Ellcey
2017-03-27 10:45 ` Szabolcs Nagy
2017-03-27 10:52 ` Ramana Radhakrishnan
2017-03-27 21:35   ` Steve Ellcey
2017-04-02  0:02 ` Wainer dos Santos Moschetta
2017-04-06 20:48   ` Steve Ellcey
2017-05-02  3:51 ` Siddhesh Poyarekar

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).