* SH: optimized memcpy/memset
@ 2002-07-05 1:33 kaz Kojima
2002-07-09 23:33 ` Ulrich Drepper
0 siblings, 1 reply; 5+ messages in thread
From: kaz Kojima @ 2002-07-05 1:33 UTC (permalink / raw)
To: libc-hacker
Hi,
This patch gives an optimized memcpy/memset for SH. The original patch
was written by Toshiyasu Morita <toshiyasu.morita@hsa.hitachi.com>.
kaz
--
2002-07-05 Kaz Kojima <kkojima@rr.iij4u.or.jp>
* sysdeps/sh/memcpy.S: Optimize. Based on a patch by Toshiyasu
Morita <toshiyasu.morita@hsa.hitachi.com>.
* sysdeps/sh/memset.S: Likewise.
Index: memcpy.S
===================================================================
RCS file: /cvs/glibc/libc/sysdeps/sh/memcpy.S,v
retrieving revision 1.2
diff -u -r1.2 memcpy.S
--- memcpy.S 6 Jul 2001 04:56:03 -0000 1.2
+++ memcpy.S 5 Jul 2002 08:07:17 -0000
@@ -1,5 +1,7 @@
-/* Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+/* Copyright (C) 1999, 2000, 2002 Free Software Foundation, Inc.
This file is part of the GNU C Library.
+ Contributed by Kazumoto Kojima <kkojima@rr.iij4u.or.jp>
+ Optimized by Toshiyasu Morita <toshiyasu.morita@hsa.hitachi.com>
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
@@ -19,213 +21,179 @@
#include <sysdep.h>
#include <endian.h>
-/*
- * void *memcpy(void *dst, const void *src, size_t n);
- * No overlap between the memory of DST and of SRC are assumed.
- */
+/* void *memcpy(void *dst, const void *src, size_t n);
+ DST and SRC are assumed not to overlap. In: r4 = dst, r5 = src, r6 = n. Out: r0 = dst. */
ENTRY(memcpy)
- tst r6,r6
- bt/s 1f
- mov r4,r0
- mov #12,r1
- cmp/gt r6,r1
- bf 2f
-0:
- mov.b @r5+,r1
- dt r6
+ mov r4,r3 /* Save destination. */
+
+ /* If less than 11 bytes, just do a byte copy. */
+ mov #11,r0
+ cmp/gt r6,r0 /* T set when n < 11. */
+ bt L_byteloop_init
+
+ /* Check if we need to word-align source. */
+ mov r5,r0 /* r0 = source cursor; tst #imm only works on r0. */
+ tst #1,r0
+ bt L_wordalign
+
+ mov.b @r0+,r1 /* Copy one byte. */
+ add #-1,r6 /* One byte fewer remains. */
mov.b r1,@r4
- bf/s 0b
add #1,r4
-1:
- rts
- nop
-2:
- mov.l r8,@-r15
- mov.l r9,@-r15
- mov r6,r2
- mov.l r10,@-r15
- mov.l r11,@-r15
- mov.l r14,@-r15
- mov r4,r11
- mov r15,r14
- mov r5,r0
- and #1,r0
- tst r0,r0
- bt/s .L42
- mov r5,r0
- mov.b @r5+,r1
- add #-1,r2
+
+ .balignw 4,0x0009 /* Pad to 4 bytes with 0x0009 (SH nop). */
+L_wordalign:
+ /* Check if we need to longword-align source. */
+ tst #2,r0
+ bt L_copy
+
+ mov.w @r0+,r1 /* Copy one word. */
+ add #-2,r6 /* Two bytes fewer remain. */
+#if __BYTE_ORDER == __BIG_ENDIAN
add #1,r4
- mov.b r1,@r11
- mov r5,r0
-.L42:
- and #2,r0
- tst r0,r0
- bt/s .L43
- mov r4,r0
- mov.b @r5+,r1
mov.b r1,@r4
- mov.b @r5+,r1
+ shlr8 r1 /* Bring the high byte down. */
+ mov.b r1,@-r4 /* Pre-decrement: high byte lands at the lower address. */
+ add #2,r4 /* Step past both bytes just written. */
+#else
+ mov.b r1,@r4 /* Low byte goes to the lower address. */
add #1,r4
- add #-2,r2
+ shlr8 r1 /* Bring the high byte down. */
mov.b r1,@r4
add #1,r4
+#endif
+L_copy:
+ mov r0,r5 /* r5 = source, now longword-aligned. */
+
+ /* Calculate the correct routine to handle the destination
+ alignment and simultaneously calculate the loop counts for
+ both the 2 word copy loop and byte copy loop. */
+ mova L_jumptable,r0
+ mov r0,r1 /* r1 = jump table base. */
mov r4,r0
- .L43:
- and #1,r0
- tst r0,r0
- bf/s .L38
- mov r4,r0
- and #2,r0
- tst r0,r0
- bf/s .L7
- mov r2,r0
- shlr2 r0
+ mov r6,r7
and #3,r0
- cmp/eq #2,r0
- bt/s .L10
- mov #2,r1
- cmp/gt r1,r0
- bt/s .L14
- cmp/eq #3,r0
- cmp/eq #1,r0
- bt/s .L11
- mov r0,r1
- bra .L44
- shll2 r1
- .align 5
-.L14:
- bf .L8
- mov.l @(8,r5),r1
- mov.l r1,@(8,r4)
-.L10:
- mov.l @(4,r5),r1
- mov.l r1,@(4,r4)
-.L11:
- mov.l @r5,r1
- mov.l r1,@r4
-.L8:
- mov r0,r1
- shll2 r1
-.L44:
- add r1,r4
- add r1,r5
- mov r2,r0
- mov #-4,r1
- shad r1,r0
- mov #3,r6
- bra .L37
- and r2,r6
- .align 5
-.L18:
+ shlr2 r7
+ shll r0 /* r0 = (dest & 3) * 2: byte offset of the .word entry. */
+ shlr r7 /* r7 = n / 8 passes; >= 1 since n >= 11 and at most 3 bytes were consumed. */
+ mov.w @(r0,r1),r2 /* r2 = handler offset relative to L_base. */
+ mov #7,r0
+ braf r2
+ and r0,r6 /* Delay slot: r6 = n % 8, left for the byte loop. */
+L_base:
+
+ .balign 4
+L_jumptable:
+ .word L_copydest0 - L_base
+ .word L_copydest1_or_3 - L_base
+ .word L_copydest2 - L_base
+ .word L_copydest1_or_3 - L_base
+
+ .balign 4
+ /* Copy routine for (dest mod 4) == 1 or == 3. */
+L_copydest1_or_3:
+ add #-1,r4 /* Bias r4 by -1 so displacements 1..8 address dest+0..7. */
+ .balignw 4,0x0009
+L_copydest1_or_3_loop:
+ mov.l @r5+,r0 /* Read first longword. */
+ dt r7 /* Count this pass; T = 1 when it is the last. */
+ mov.l @r5+,r1 /* Read second longword. */
+#if __BYTE_ORDER == __BIG_ENDIAN
+ /* Write first longword as byte, word, byte. */
+ mov.b r0,@(4,r4)
+ shlr8 r0
+ mov.w r0,@(2,r4)
+ shlr16 r0
+ mov.b r0,@(1,r4)
+ mov r1,r0 /* Displacement stores take their data from r0. */
+ /* Write second longword as byte, word, byte. */
+ mov.b r0,@(8,r4)
+ shlr8 r0
+ mov.w r0,@(6,r4)
+ shlr16 r0
+ mov.b r0,@(5,r4)
+#else
+ /* Write first longword as byte, word, byte. */
+ mov.b r0,@(1,r4)
+ shlr8 r0
+ mov.w r0,@(2,r4)
+ shlr16 r0
+ mov.b r0,@(4,r4)
+ mov r1,r0 /* Displacement stores take their data from r0. */
+ /* Write second longword as byte, word, byte. */
+ mov.b r0,@(5,r4)
+ shlr8 r0
+ mov.w r0,@(6,r4)
+ shlr16 r0
+ mov.b r0,@(8,r4)
+#endif
+ bf/s L_copydest1_or_3_loop
+ add #8,r4 /* Delay slot: advance dest by the 8 bytes written. */
+
+ bra L_byteloop_init
+ add #1,r4 /* Delay slot: undo the -1 bias. */
+
+ .balign 4
+ /* Copy routine for (dest mod 4) == 2. */
+L_copydest2:
+L_copydest2_loop:
+ mov.l @r5+,r0
+ dt r7 /* Count this pass; T = 1 when it is the last. */
mov.l @r5+,r1
- mov.l @r5+,r2
- mov.l @r5+,r3
- mov.l @r5+,r7
- mov.l r1,@r4
- mov.l r2,@(4,r4)
- mov.l r3,@(8,r4)
- mov.l r7,@(12,r4)
- add #16,r4
-.L37:
- cmp/pl r0
- bt/s .L18
- add #-1,r0
- mov r6,r2
-.L38:
- bra .L40
- mov r2,r0
- .align 5
-.L7:
- shar r0
- and #3,r0
- cmp/eq #2,r0
- bt/s .L23
- mov #2,r1
- cmp/gt r1,r0
- bt/s .L27
- cmp/eq #3,r0
- cmp/eq #1,r0
- bt/s .L24
- mov r0,r1
- bra .L45
- add r0,r1
- .align 5
-.L27:
- bf .L21
- add #4,r5
- mov.w @r5,r1
- add #4,r4
- mov.w r1,@r4
- add #-4,r5
- add #-4,r4
-.L23:
- add #2,r5
- mov.w @r5,r1
- add #2,r4
- mov.w r1,@r4
- add #-2,r5
- add #-2,r4
-.L24:
- mov.w @r5,r1
- mov.w r1,@r4
-.L21:
- mov r0,r1
- add r0,r1
-.L45:
- add r1,r4
- add r1,r5
- mov r2,r0
- mov #-3,r1
- shad r1,r0
- mov #1,r10
- mov r0,r1
- and r2,r10
- cmp/pl r1
- bf/s .L29
- add #-1,r0
- mov r4,r9
- mov r4,r8
- add #4,r9
- mov r4,r6
- add #6,r8
- add #2,r6
-.L31:
- mov.w @r5+,r1
- mov.w @r5+,r2
- mov.w @r5+,r3
- mov.w @r5+,r7
- mov.w r1,@r4
- mov.w r2,@r6
+#if __BYTE_ORDER == __BIG_ENDIAN
+ mov.w r0,@(2,r4) /* Each longword is written as two word stores. */
+ shlr16 r0
+ mov.w r0,@r4
+ mov r1,r0 /* Displacement stores take their data from r0. */
+ mov.w r0,@(6,r4)
+ shlr16 r0
+ mov.w r0,@(4,r4)
+#else
+ mov.w r0,@r4 /* Each longword is written as two word stores. */
+ shlr16 r0
+ mov.w r0,@(2,r4)
+ mov r1,r0 /* Displacement stores take their data from r0. */
+ mov.w r0,@(4,r4)
+ shlr16 r0
+ mov.w r0,@(6,r4)
+#endif
+ bf/s L_copydest2_loop
add #8,r4
- mov r0,r1
- add #8,r6
- mov.w r3,@r9
- add #-1,r0
- add #8,r9
- mov.w r7,@r8
- cmp/pl r1
- bt/s .L31
- add #8,r8
-.L29:
- mov r10,r2
- mov r2,r0
-.L40:
- cmp/pl r0
- bf .L34
-.L35:
- mov.b @r5+,r1
- dt r2
- mov.b r1,@r4
- bf/s .L35
+
+ bra L_byteloop_init
+ nop
+
+ .balign 4
+ /* Copy routine for (dest mod 4) == 0. */
+L_copydest0:
+ add #-8,r4 /* Pre-bias: the loop adds 8 before it stores. */
+ .balignw 4,0x0009
+L_copydest0_loop:
+ mov.l @r5+,r0
+ dt r7 /* Count this pass; T = 1 when it is the last. */
+ mov.l @r5+,r1
+ add #8,r4
+ mov.l r0,@r4
+ bf/s L_copydest0_loop
+ mov.l r1,@(4,r4) /* Delay slot: second longword of the pair. */
+
+ add #8,r4 /* Fall through. */
+
+L_byteloop_init:
+ tst r6,r6
+ bt L_exit /* No tail bytes: skip the loop (dt would wrap at 0). */
+
+ .balignw 4,0x0009
+ /* Copy remaining bytes. */
+L_byteloop:
+ mov.b @r5+,r0
+ dt r6 /* T = 1 once the last byte has been fetched. */
+ mov.b r0,@r4
+ bf/s L_byteloop
add #1,r4
-.L34:
- mov r11,r0
- mov r14,r15
- mov.l @r15+,r14
- mov.l @r15+,r11
- mov.l @r15+,r10
- mov.l @r15+,r9
- rts
- mov.l @r15+,r8
+
+L_exit:
+ rts
+ mov r3,r0 /* Return destination. */
+END(memcpy)
Index: memset.S
===================================================================
RCS file: /cvs/glibc/libc/sysdeps/sh/memset.S,v
retrieving revision 1.2
diff -u -r1.2 memset.S
--- memset.S 6 Jul 2001 04:56:03 -0000 1.2
+++ memset.S 5 Jul 2002 08:07:17 -0000
@@ -1,6 +1,7 @@
-/* Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+/* Copyright (C) 1999, 2000, 2002 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Kazumoto Kojima <kkojima@rr.iij4u.or.jp>
+ Optimized by Toshiyasu Morita <toshiyasu.morita@hsa.hitachi.com>
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
@@ -19,61 +20,69 @@
#include <sysdep.h>
-/* void *memset (t, c, len) */
+/* void *memset (t, c, len); In: r4 = t, r5 = c, r6 = len. Out: r0 = t. */
ENTRY(memset)
- tst r6, r6
- bt/s end
- mov r4, r3
- mov #3, r0
- cmp/hs r6, r0
- bt/s 2f
- and r4, r0
- tst r0, r0
- bt/s 1f
- add r0, r6
- add #-1, r0
- shll2 r0
- braf r0
- add #-4, r6
-
- mov.b r5, @r4
- add #1, r4
- mov.b r5, @r4
- add #1, r4
- mov.b r5, @r4
- add #1, r4
-1:
- extu.b r5, r0
- shll8 r5
- or r5, r0
- extu.w r0, r0
- mov r0, r5
- swap.w r5, r5
- or r0, r5
-
-2:
- add #-4, r6
- cmp/pz r6
- bf afew
- mov.l r5, @r4
- bra 2b
- add #4, r4
-
-afew:
- mov #-1, r0
- sub r6, r0
- shll2 r0
- braf r0
- nop
-
- mov.b r5, @r4
- add #1, r4
- mov.b r5, @r4
- add #1, r4
- mov.b r5, @r4
- add #1, r4
-end:
+ mov #12,r0
+ cmp/gt r6,r0 /* T set when len < 12. */
+ bt.s L_byte_loop_init /* Short fill: plain byte loop (mov.b only stores the low byte). */
+ mov r4,r7 /* Delay slot: save t for the return value. */
+
+ extu.b r5,r5 /* Keep only (unsigned char) c; upper bits of c would otherwise be OR-ed into the pattern. */
+ swap.b r5,r1 /* r1 = c << 8. */
+ or r1,r5 /* r5 = c in both low bytes. */
+ swap.w r5,r1 /* r1 = that halfword moved to the upper half. */
+ or r1,r5 /* r5 = c replicated into all four bytes. */
+
+ mov r4,r0 /* tst #imm only works on r0. */
+ tst #1,r0
+ bt L_wordalign /* Already 2-byte aligned. */
+
+ mov.b r5,@r4 /* Store one byte to reach 2-byte alignment. */
+ add #-1,r6
+ add #1,r4
+ mov r4,r0
+
+ .balignw 4,0x0009 /* Pad to 4 bytes with 0x0009 (SH nop). */
+L_wordalign:
+ tst #2,r0
+ bt L_word_loop_init /* Already 4-byte aligned. */
+
+ mov.w r5,@r4 /* Store one word to reach 4-byte alignment. */
+ add #-2,r6
+ add #2,r4
+ mov r4,r0
+
+ .balignw 4,0x0009
+L_word_loop_init:
+ mov r6,r3
+ shlr2 r3
+ mov #7,r0
+ shlr r3 /* r3 = len / 8 passes; >= 1 since len >= 12 and at most 3 bytes were consumed. */
+ and r0,r6 /* r6 = len % 8 tail bytes. */
+
+ .balignw 4,0x0009
+L_2word_loop:
+ mov.l r5,@r4
+ dt r3 /* Count this pass; T = 1 when it is the last. */
+ mov.l r5,@(4,r4)
+ bf.s L_2word_loop
+ add #8,r4 /* Delay slot: advance by the 8 bytes written. */
+
+ .balignw 4,0x0009
+L_byte_loop_init:
+ tst r6,r6
+ bt L_byte_exit /* No tail bytes: skip the loop (dt would wrap at 0). */
+
+ .balignw 4,0x0009
+L_byte_loop:
+ mov.b r5,@r4
+ dt r6 /* T = 1 once the last byte has been stored. */
+ bf.s L_byte_loop
+ add #1,r4 /* Delay slot: next destination byte. */
+
+ .balignw 4,0x0009
+L_byte_exit:
rts
- mov r3, r0
+ mov r7,r0 /* Delay slot: return t. */
END(memset)
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: SH: optimized memcpy/memset
2002-07-05 1:33 SH: optimized memcpy/memset kaz Kojima
@ 2002-07-09 23:33 ` Ulrich Drepper
2002-07-09 23:50 ` kaz Kojima
0 siblings, 1 reply; 5+ messages in thread
From: Ulrich Drepper @ 2002-07-09 23:33 UTC (permalink / raw)
To: kaz Kojima; +Cc: GNU libc hacker
[-- Attachment #1: Type: text/plain, Size: 519 bytes --]
On Fri, 2002-07-05 at 01:18, kaz Kojima wrote:
> This patch gives an optimized memcpy/memset for SH. The original patch
> was written by Toshiyasu Morita <toshiyasu.morita@hsa.hitachi.com>.
We need an assignment. I don't trust semi-conductor companies
contributing code without one.
--
---------------. ,-. 1325 Chesapeake Terrace
Ulrich Drepper \ ,-------------------' \ Sunnyvale, CA 94089 USA
Red Hat `--' drepper at redhat.com `------------------------
[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 232 bytes --]
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: SH: optimized memcpy/memset
2002-07-09 23:33 ` Ulrich Drepper
@ 2002-07-09 23:50 ` kaz Kojima
2003-01-03 3:57 ` kaz Kojima
0 siblings, 1 reply; 5+ messages in thread
From: kaz Kojima @ 2002-07-09 23:50 UTC (permalink / raw)
To: libc-hacker
Ulrich Drepper <drepper@redhat.com> wrote:
> On Fri, 2002-07-05 at 01:18, kaz Kojima wrote:
>
>> This patch gives an optimized memcpy/memset for SH. The original patch
>> was written by Toshiyasu Morita <toshiyasu.morita@hsa.hitachi.com>.
>
> We need an assignment. I don't trust semi-conductor companies
> contributing code without one.
I'll request Morita-san to prepare an assignment for it.
Thanks,
kaz
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: SH: optimized memcpy/memset
2002-07-09 23:50 ` kaz Kojima
@ 2003-01-03 3:57 ` kaz Kojima
2003-01-03 9:59 ` Ulrich Drepper
0 siblings, 1 reply; 5+ messages in thread
From: kaz Kojima @ 2003-01-03 3:57 UTC (permalink / raw)
To: libc-hacker; +Cc: toshiyasu.morita
Hi,
I wrote:
> Ulrich Drepper <drepper@redhat.com> wrote:
>> On Fri, 2002-07-05 at 01:18, kaz Kojima wrote:
>>
>>> This patch gives an optimized memcpy/memset for SH. The original patch
>>> was written by Toshiyasu Morita <toshiyasu.morita@hsa.hitachi.com>.
>>
>> We need an assignment. I don't trust semi-conductor companies
>> contributing code without one.
>
> I'll request Morita-san to prepare an assignment for it.
Now his and his company's paperwork with FSF has been done.
So, is the patch
<URL:http://sources.redhat.com/ml/libc-hacker/2002-07/msg00010.html>
ok?
Regards,
kaz
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: SH: optimized memcpy/memset
2003-01-03 3:57 ` kaz Kojima
@ 2003-01-03 9:59 ` Ulrich Drepper
0 siblings, 0 replies; 5+ messages in thread
From: Ulrich Drepper @ 2003-01-03 9:59 UTC (permalink / raw)
To: kaz Kojima; +Cc: libc-hacker, toshiyasu.morita
kaz Kojima wrote:
> Now his and his company's paperwork with FSF has been done.
I haven't received a notification about this but I believe you that it
happened. I've applied the patch now. Thanks,
--
--------------. ,-. 444 Castro Street
Ulrich Drepper \ ,-----------------' \ Mountain View, CA 94041 USA
Red Hat `--' drepper at redhat.com `---------------------------
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2003-01-03 9:59 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2002-07-05 1:33 SH: optimized memcpy/memset kaz Kojima
2002-07-09 23:33 ` Ulrich Drepper
2002-07-09 23:50 ` kaz Kojima
2003-01-03 3:57 ` kaz Kojima
2003-01-03 9:59 ` Ulrich Drepper
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).