public inbox for libc-hacker@sourceware.org
 help / color / mirror / Atom feed
* SH: optimized memcpy/memset
@ 2002-07-05  1:33 kaz Kojima
  2002-07-09 23:33 ` Ulrich Drepper
  0 siblings, 1 reply; 5+ messages in thread
From: kaz Kojima @ 2002-07-05  1:33 UTC (permalink / raw)
  To: libc-hacker

Hi,

This patch gives an optimized memcpy/memset for SH. The original patch
was written by Toshiyasu Morita <toshiyasu.morita@hsa.hitachi.com>.

	kaz
--
2002-07-05 Kaz Kojima  <kkojima@rr.iij4u.or.jp>

	* sysdeps/sh/memcpy.S: Optimize. Based on a patch by Toshiyasu
	Morita <toshiyasu.morita@hsa.hitachi.com>.
	* sysdeps/sh/memset.S: Likewise.

Index: memcpy.S
===================================================================
RCS file: /cvs/glibc/libc/sysdeps/sh/memcpy.S,v
retrieving revision 1.2
diff -u -r1.2 memcpy.S
--- memcpy.S	6 Jul 2001 04:56:03 -0000	1.2
+++ memcpy.S	5 Jul 2002 08:07:17 -0000
@@ -1,5 +1,7 @@
-/* Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+/* Copyright (C) 1999, 2000, 2002 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
+   Contributed by Kazumoto Kojima <kkojima@rr.iij4u.or.jp>
+   Optimized by Toshiyasu Morita <toshiyasu.morita@hsa.hitachi.com>
 
    The GNU C Library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
@@ -19,213 +21,179 @@
 #include <sysdep.h>
 #include <endian.h>
 
-/*
- * void *memcpy(void *dst, const void *src, size_t n);
- * No overlap between the memory of DST and of SRC are assumed.
- */
+/* void *memcpy(void *dst, const void *src, size_t n);
+    The memory areas of DST and SRC are assumed not to overlap.  */
 
 ENTRY(memcpy)
-	tst	r6,r6
-	bt/s	1f
-	mov	r4,r0
-	mov	#12,r1
-	cmp/gt	r6,r1
-	bf	2f
-0:
-	mov.b	@r5+,r1
-	dt	r6
+	mov	r4,r3		/* Save destination.  */
+
+	/* If less than 11 bytes, just do a byte copy.  */
+	mov	#11,r0
+	cmp/gt	r6,r0
+	bt	L_byteloop_init
+
+	/* Check if we need to word-align source.  */
+	mov	r5,r0
+	tst	#1,r0
+	bt	L_wordalign
+
+	mov.b	@r0+,r1		/* Copy one byte.  */
+	add	#-1,r6
 	mov.b	r1,@r4
-	bf/s	0b
 	add	#1,r4
-1:
-	rts
-	nop
-2:	
-	mov.l	r8,@-r15
-	mov.l	r9,@-r15
-	mov	r6,r2
-	mov.l	r10,@-r15
-	mov.l	r11,@-r15
-	mov.l	r14,@-r15
-	mov	r4,r11
-	mov	r15,r14
-	mov	r5,r0
-	and	#1,r0
-	tst	r0,r0
-	bt/s	.L42
-	mov	r5,r0
-	mov.b	@r5+,r1
-	add	#-1,r2
+
+	.balignw 4,0x0009
+L_wordalign:
+	/* Check if we need to longword-align source.  */
+	tst	#2,r0
+	bt	L_copy
+
+	mov.w	@r0+,r1		/* Copy one word.  */
+	add	#-2,r6
+#if __BYTE_ORDER == __BIG_ENDIAN
 	add	#1,r4
-	mov.b	r1,@r11
-	mov	r5,r0
-.L42:
-	and	#2,r0
-	tst	r0,r0
-	bt/s	.L43
-	mov	r4,r0
-	mov.b	@r5+,r1
 	mov.b	r1,@r4
-	mov.b	@r5+,r1
+	shlr8	r1
+	mov.b	r1,@-r4
+	add	#2,r4
+#else
+	mov.b	r1,@r4
 	add	#1,r4
-	add	#-2,r2
+	shlr8	r1
 	mov.b	r1,@r4
 	add	#1,r4
+#endif
+L_copy:
+	mov	r0,r5
+
+	/* Calculate the correct routine to handle the destination
+	   alignment and simultaneously calculate the loop counts for
+	   both the 2 word copy loop and byte copy loop.  */
+	mova	L_jumptable,r0
+	mov	r0,r1
 	mov	r4,r0
-.L43:
-	and	#1,r0
-	tst	r0,r0
-	bf/s	.L38
-	mov	r4,r0
-	and	#2,r0
-	tst	r0,r0
-	bf/s	.L7
-	mov	r2,r0
-	shlr2	r0
+	mov	r6,r7
 	and	#3,r0
-	cmp/eq	#2,r0
-	bt/s	.L10
-	mov	#2,r1
-	cmp/gt	r1,r0
-	bt/s	.L14
-	cmp/eq	#3,r0
-	cmp/eq	#1,r0
-	bt/s	.L11
-	mov	r0,r1
-	bra	.L44
-	shll2	r1
-	.align 5
-.L14:
-	bf	.L8
-	mov.l	@(8,r5),r1
-	mov.l	r1,@(8,r4)
-.L10:
-	mov.l	@(4,r5),r1
-	mov.l	r1,@(4,r4)
-.L11:
-	mov.l	@r5,r1
-	mov.l	r1,@r4
-.L8:
-	mov	r0,r1
-	shll2	r1
-.L44:
-	add	r1,r4
-	add	r1,r5
-	mov	r2,r0
-	mov	#-4,r1
-	shad	r1,r0
-	mov	#3,r6
-	bra	.L37
-	and	r2,r6
-	.align 5
-.L18:
+	shlr2	r7
+	shll	r0
+	shlr	r7
+	mov.w	@(r0,r1),r2
+	mov	#7,r0
+	braf	r2
+	and	r0,r6
+L_base:
+
+	.balign	4
+L_jumptable:
+	.word	L_copydest0 - L_base
+	.word	L_copydest1_or_3 - L_base
+	.word	L_copydest2 - L_base
+	.word	L_copydest1_or_3 - L_base
+
+	.balign	4
+	/* Copy routine for (dest mod 4) == 1 or == 3.  Moves 8 bytes per pass; r7 counts the passes.  */
+L_copydest1_or_3:
+	add	#-1,r4		/* Bias r4: dest is now @(1,r4), so @(2,r4) and @(6,r4) are 2-byte aligned.  */
+	.balignw 4,0x0009
+L_copydest1_or_3_loop:
+	mov.l	@r5+,r0		/* Read first longword.  */
+	dt	r7		/* Count this pass; T is tested by the bf/s at the bottom.  */
+	mov.l	@r5+,r1		/* Read second longword.  */
+#if __BYTE_ORDER == __BIG_ENDIAN
+	/* Write first longword as byte, word, byte.  */
+	mov.b	r0,@(4,r4)	/* Lowest-order byte goes to the highest address.  */
+	shlr8	r0
+	mov.w	r0,@(2,r4)
+	shlr16	r0
+	mov.b	r0,@(1,r4)
+	mov	r1,r0		/* Displacement stores take their data from r0.  */
+	/* Write second longword as byte, word, byte.  */
+	mov.b	r0,@(8,r4)
+	shlr8	r0
+	mov.w	r0,@(6,r4)
+	shlr16	r0
+	mov.b	r0,@(5,r4)
+#else
+	/* Write first longword as byte, word, byte.  */
+	mov.b	r0,@(1,r4)	/* Lowest-order byte goes to the lowest address.  */
+	shlr8	r0
+	mov.w	r0,@(2,r4)
+	shlr16	r0
+	mov.b	r0,@(4,r4)
+	mov	r1,r0		/* Displacement stores take their data from r0.  */
+	/* Write second longword as byte, word, byte.  */
+	mov.b	r0,@(5,r4)
+	shlr8	r0
+	mov.w	r0,@(6,r4)
+	shlr16	r0
+	mov.b	r0,@(8,r4)
+#endif
+	bf/s	L_copydest1_or_3_loop
+	add	#8,r4		/* Delay slot: step past the 8 bytes just written.  */
+
+	bra	L_byteloop_init
+	add	#1,r4		/* Delay slot: undo the -1 bias applied on entry.  */
+
+	.balign 4
+	/* Copy routine for (dest mod 4) == 2.  */
+L_copydest2:
+L_copydest2_loop:
+	mov.l	@r5+,r0
+	dt	r7
 	mov.l	@r5+,r1
-	mov.l	@r5+,r2
-	mov.l	@r5+,r3
-	mov.l	@r5+,r7
-	mov.l	r1,@r4
-	mov.l	r2,@(4,r4)
-	mov.l	r3,@(8,r4)
-	mov.l	r7,@(12,r4)
-	add	#16,r4
-.L37:
-	cmp/pl	r0
-	bt/s	.L18
-	add	#-1,r0
-	mov	r6,r2
-.L38:
-	bra	.L40
-	mov	r2,r0
-	.align 5
-.L7:
-	shar	r0
-	and	#3,r0
-	cmp/eq	#2,r0
-	bt/s	.L23
-	mov	#2,r1
-	cmp/gt	r1,r0
-	bt/s	.L27
-	cmp/eq	#3,r0
-	cmp/eq	#1,r0
-	bt/s	.L24
-	mov	r0,r1
-	bra	.L45
-	add	r0,r1
-	.align 5
-.L27:
-	bf	.L21
-	add	#4,r5
-	mov.w	@r5,r1
-	add	#4,r4
-	mov.w	r1,@r4
-	add	#-4,r5
-	add	#-4,r4
-.L23:
-	add	#2,r5
-	mov.w	@r5,r1
-	add	#2,r4
-	mov.w	r1,@r4
-	add	#-2,r5
-	add	#-2,r4
-.L24:
-	mov.w	@r5,r1
-	mov.w	r1,@r4
-.L21:
-	mov	r0,r1
-	add	r0,r1
-.L45:
-	add	r1,r4
-	add	r1,r5
-	mov	r2,r0
-	mov	#-3,r1
-	shad	r1,r0
-	mov	#1,r10
-	mov	r0,r1
-	and	r2,r10
-	cmp/pl	r1
-	bf/s	.L29
-	add	#-1,r0
-	mov	r4,r9
-	mov	r4,r8
-	add	#4,r9
-	mov	r4,r6
-	add	#6,r8
-	add	#2,r6
-.L31:
-	mov.w	@r5+,r1
-	mov.w	@r5+,r2
-	mov.w	@r5+,r3
-	mov.w	@r5+,r7
-	mov.w	r1,@r4
-	mov.w	r2,@r6
+#if __BYTE_ORDER == __BIG_ENDIAN
+	mov.w	r0,@(2,r4)
+	shlr16	r0
+	mov.w	r0,@r4
+	mov	r1,r0
+	mov.w	r0,@(6,r4)
+	shlr16	r0
+	mov.w	r0,@(4,r4)
+#else
+	mov.w	r0,@r4
+	shlr16	r0
+	mov.w	r0,@(2,r4)
+	mov	r1,r0
+	mov.w	r0,@(4,r4)
+	shlr16	r0
+	mov.w	r0,@(6,r4)
+#endif
+	bf/s	L_copydest2_loop
 	add	#8,r4
-	mov	r0,r1
-	add	#8,r6
-	mov.w	r3,@r9
-	add	#-1,r0
-	add	#8,r9
-	mov.w	r7,@r8
-	cmp/pl	r1
-	bt/s	.L31
-	add	#8,r8
-.L29:
-	mov	r10,r2
-	mov	r2,r0
-.L40:
-	cmp/pl	r0
-	bf	.L34
-.L35:
-	mov.b	@r5+,r1
-	dt	r2
-	mov.b	r1,@r4
-	bf/s	.L35
+
+	bra	L_byteloop_init
+	nop
+
+	.balign 4
+	/* Copy routine for (dest mod 4) == 0.  Moves two longwords per pass; r7 counts the passes.  */
+L_copydest0:
+	add	#-8,r4		/* Pre-bias: the loop adds 8 before it stores.  */
+	.balignw 4,0x0009
+L_copydest0_loop:
+	mov.l	@r5+,r0
+	dt	r7		/* Count this pass; T is tested by the bf/s below.  */
+	mov.l	@r5+,r1
+	add	#8,r4
+	mov.l	r0,@r4
+	bf/s	L_copydest0_loop
+	mov.l	r1,@(4,r4)	/* Delay slot: store the second longword on every pass.  */
+
+	add	#8,r4		/* Step past the last pair, then fall through to the byte loop.  */
+
+L_byteloop_init:
+	tst	r6,r6
+	bt	L_exit
+
+	.balignw 4,0x0009
+	/* Copy remaining bytes.  */
+L_byteloop:
+	mov.b	@r5+,r0
+	dt	r6
+	mov.b	r0,@r4
+	bf/s	L_byteloop
 	add	#1,r4
-.L34:
-	mov	r11,r0
-	mov	r14,r15
-	mov.l	@r15+,r14
-	mov.l	@r15+,r11
-	mov.l	@r15+,r10
-	mov.l	@r15+,r9
-	rts	
-	mov.l	@r15+,r8
+
+L_exit:
+	rts
+	mov	r3,r0		/* Return destination.  */
+END(memcpy)
Index: memset.S
===================================================================
RCS file: /cvs/glibc/libc/sysdeps/sh/memset.S,v
retrieving revision 1.2
diff -u -r1.2 memset.S
--- memset.S	6 Jul 2001 04:56:03 -0000	1.2
+++ memset.S	5 Jul 2002 08:07:17 -0000
@@ -1,6 +1,7 @@
-/* Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+/* Copyright (C) 1999, 2000, 2002 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Kazumoto Kojima <kkojima@rr.iij4u.or.jp>
+   Optimized by Toshiyasu Morita <toshiyasu.morita@hsa.hitachi.com>
 
    The GNU C Library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
@@ -19,61 +20,68 @@
 
 #include <sysdep.h>
 
-/* void *memset (t, c, len)  */
+/* void *memset (t, c, len);  */
 
 ENTRY(memset)
-	tst	r6, r6
-	bt/s	end
-	mov	r4, r3
-	mov	#3, r0
-	cmp/hs	r6, r0
-	bt/s	2f
-	and	r4, r0
-	tst	r0, r0
-	bt/s	1f
-	add	r0, r6
-	add	#-1, r0
-	shll2	r0
-	braf	r0
-	add	#-4, r6
-
-	mov.b	r5, @r4
-	add	#1, r4
-	mov.b	r5, @r4
-	add	#1, r4
-	mov.b	r5, @r4
-	add	#1, r4
-1:
-	extu.b	r5, r0
-	shll8	r5
-	or	r5, r0
-	extu.w	r0, r0
-	mov	r0, r5
-	swap.w	r5, r5
-	or	r0, r5
-	
-2:
-	add	#-4, r6
-	cmp/pz	r6
-	bf	afew
-	mov.l	r5, @r4
-	bra	2b
-	add	#4, r4
-
-afew:
-	mov	#-1, r0
-	sub	r6, r0
-	shll2	r0
-	braf	r0
-	nop
-
-	mov.b	r5, @r4
-	add	#1, r4
-	mov.b	r5, @r4
-	add	#1, r4
-	mov.b	r5, @r4
-	add	#1, r4
-end:
+	mov	#12,r0
+	cmp/gt	r6,r0		/* T = (12 > len), signed compare.  */
+	bt.s	L_byte_loop_init	/* Short fill: byte loop only.  */
+	mov	r4,r7		/* Delay slot: save t for the return value.  */
+	/* Replicate the fill byte into all four bytes of r5.  */
+	extu.b	r5,r5		/* c is an int; keep only its low 8 bits.  */
+	swap.b	r5,r1
+	or	r1,r5		/* r5 = 0x0000VVVV.  */
+	swap.w	r5,r1
+	or	r1,r5		/* r5 = 0xVVVVVVVV.  */
+	/* Store one byte if the destination address is odd.  */
+	mov	r4,r0
+	tst	#1,r0
+	bt	L_wordalign
+
+	mov.b	r5,@r4
+	add	#-1,r6
+	add	#1,r4
+	mov	r4,r0		/* tst #imm works on r0; reload it for the bit-1 test.  */
+
+	.balignw 4,0x0009	/* Pad with nop (0x0009) up to a 4-byte boundary.  */
+L_wordalign:
+	tst	#2,r0
+	bt	L_word_loop_init
+
+	mov.w	r5,@r4		/* Store one word to reach a 4-byte boundary.  */
+	add	#-2,r6
+	add	#2,r4
+
+	.balignw 4,0x0009
+L_word_loop_init:
+	mov	r6,r3
+	shlr2	r3
+	mov	#7,r0
+	shlr	r3		/* r3 = len / 8; at least 1, since len >= 9 here.  */
+	and	r0,r6		/* r6 = len % 8, left for the byte loop.  */
+
+	.balignw 4,0x0009
+L_2word_loop:
+	mov.l	r5,@r4
+	dt	r3
+	mov.l	r5,@(4,r4)
+	bf.s	L_2word_loop
+	add	#8,r4		/* Delay slot: runs on every pass.  */
+
+	.balignw 4,0x0009
+L_byte_loop_init:
+	tst	r6,r6
+	bt	L_byte_exit	/* Nothing left to store (also covers len == 0).  */
+
+	.balignw 4,0x0009
+L_byte_loop:
+	mov.b	r5,@r4
+	dt	r6
+	bf.s	L_byte_loop
+	add	#1,r4
+
+	.balignw 4,0x0009
+L_byte_exit:
 	rts
-	mov	r3, r0
+	mov	r7,r0
 END(memset)

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: SH: optimized memcpy/memset
  2002-07-05  1:33 SH: optimized memcpy/memset kaz Kojima
@ 2002-07-09 23:33 ` Ulrich Drepper
  2002-07-09 23:50   ` kaz Kojima
  0 siblings, 1 reply; 5+ messages in thread
From: Ulrich Drepper @ 2002-07-09 23:33 UTC (permalink / raw)
  To: kaz Kojima; +Cc: GNU libc hacker

[-- Attachment #1: Type: text/plain, Size: 519 bytes --]

On Fri, 2002-07-05 at 01:18, kaz Kojima wrote:

> This patch gives an optimized memcpy/memset for SH. The original patch
> was written by Toshiyasu Morita <toshiyasu.morita@hsa.hitachi.com>.

We need an assignment.  I don't trust semi-conductor companies
contributing code without one.

-- 
---------------.                          ,-.   1325 Chesapeake Terrace
Ulrich Drepper  \    ,-------------------'   \  Sunnyvale, CA 94089 USA
Red Hat          `--' drepper at redhat.com   `------------------------

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 232 bytes --]

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: SH: optimized memcpy/memset
  2002-07-09 23:33 ` Ulrich Drepper
@ 2002-07-09 23:50   ` kaz Kojima
  2003-01-03  3:57     ` kaz Kojima
  0 siblings, 1 reply; 5+ messages in thread
From: kaz Kojima @ 2002-07-09 23:50 UTC (permalink / raw)
  To: libc-hacker

Ulrich Drepper <drepper@redhat.com> wrote:
> On Fri, 2002-07-05 at 01:18, kaz Kojima wrote:
> 
>> This patch gives an optimized memcpy/memset for SH. The original patch
>> was written by Toshiyasu Morita <toshiyasu.morita@hsa.hitachi.com>.
> 
> We need an assignment.  I don't trust semi-conductor companies
> contributing code without one.

I'll request Morita-san to prepare an assignment for it.
Thanks,

	kaz

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: SH: optimized memcpy/memset
  2002-07-09 23:50   ` kaz Kojima
@ 2003-01-03  3:57     ` kaz Kojima
  2003-01-03  9:59       ` Ulrich Drepper
  0 siblings, 1 reply; 5+ messages in thread
From: kaz Kojima @ 2003-01-03  3:57 UTC (permalink / raw)
  To: libc-hacker; +Cc: toshiyasu.morita

Hi,

I wrote:
> Ulrich Drepper <drepper@redhat.com> wrote:
>> On Fri, 2002-07-05 at 01:18, kaz Kojima wrote:
>> 
>>> This patch gives an optimized memcpy/memset for SH. The original patch
>>> was written by Toshiyasu Morita <toshiyasu.morita@hsa.hitachi.com>.
>> 
>> We need an assignment.  I don't trust semi-conductor companies
>> contributing code without one.
> 
> I'll request Morita-san to prepare an assignment for it.

Now his and his company's paperwork with FSF has been done.
So, is the patch
<URL:http://sources.redhat.com/ml/libc-hacker/2002-07/msg00010.html>
ok?

Regards,
	kaz

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: SH: optimized memcpy/memset
  2003-01-03  3:57     ` kaz Kojima
@ 2003-01-03  9:59       ` Ulrich Drepper
  0 siblings, 0 replies; 5+ messages in thread
From: Ulrich Drepper @ 2003-01-03  9:59 UTC (permalink / raw)
  To: kaz Kojima; +Cc: libc-hacker, toshiyasu.morita

kaz Kojima wrote:

> Now his and his company's paperwork with FSF has been done.

I haven't received a notification about this but I believe you that it
happened.  I've applied the patch now.  Thanks,

-- 
--------------.                        ,-.            444 Castro Street
Ulrich Drepper \    ,-----------------'   \ Mountain View, CA 94041 USA
Red Hat         `--' drepper at redhat.com `---------------------------

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2003-01-03  9:59 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2002-07-05  1:33 SH: optimized memcpy/memset kaz Kojima
2002-07-09 23:33 ` Ulrich Drepper
2002-07-09 23:50   ` kaz Kojima
2003-01-03  3:57     ` kaz Kojima
2003-01-03  9:59       ` Ulrich Drepper

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).