* [PATCH] S390: Optimize __memset_z196.
@ 2020-06-19 13:51 Stefan Liebler
2020-06-25 8:18 ` Stefan Liebler
0 siblings, 1 reply; 3+ messages in thread
From: Stefan Liebler @ 2020-06-19 13:51 UTC (permalink / raw)
To: libc-alpha; +Cc: Stefan Liebler
It turned out that a 256-byte mvc instruction which depends on the
result of a previous 256-byte mvc instruction is counterproductive.
Therefore this patch adjusts the 256-byte loop by storing the
first byte with stc and setting the remaining 255 bytes with mvc.
Now the 255-byte mvc instruction depends on the stc instruction
rather than on the previous mvc instruction.
---
sysdeps/s390/memset-z900.S | 19 +++++++++----------
1 file changed, 9 insertions(+), 10 deletions(-)
diff --git a/sysdeps/s390/memset-z900.S b/sysdeps/s390/memset-z900.S
index ca3eac0522..1e0c334156 100644
--- a/sysdeps/s390/memset-z900.S
+++ b/sysdeps/s390/memset-z900.S
@@ -157,28 +157,27 @@ ENTRY(MEMSET_Z196)
# if !defined __s390x__
llgfr %r4,%r4
# endif /* !defined __s390x__ */
- ltgr %r4,%r4
- je .L_Z196_4
+ clgfi %r4,1
+ jl .L_Z196_4 # n == 0
stc %r3,0(%r2)
+ je .L_Z196_4 # n == 1
+ aghi %r4,-2
lgr %r1,%r2
- cghi %r4,1
- je .L_Z196_4
- aghi %r4,-2
- srlg %r5,%r4,8
- ltgr %r5,%r5
- jne .L_Z196_1
+ risbg %r5,%r4,8,128+63,56 # r5 = n / 256
+ jne .L_Z196_1 # Jump away if r5 != 0
.L_Z196_3:
exrl %r4,.L_Z196_17
.L_Z196_4:
br %r14
.L_Z196_1:
cgfi %r5,1048576
- jh __memset_mvcle # Switch to mvcle for >256MB
+ jh __memset_mvcle # Switch to mvcle for >256MB
.L_Z196_2:
pfd 2,1024(%r1)
- mvc 1(256,%r1),0(%r1)
+ mvc 1(255,%r1),0(%r1)
aghi %r5,-1
la %r1,256(%r1)
+ stc %r3,0(%r1)
jne .L_Z196_2
j .L_Z196_3
.L_Z196_17:
--
2.25.0
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [PATCH] S390: Optimize __memset_z196.
2020-06-19 13:51 [PATCH] S390: Optimize __memset_z196 Stefan Liebler
@ 2020-06-25 8:18 ` Stefan Liebler
2020-06-26 7:47 ` Stefan Liebler
0 siblings, 1 reply; 3+ messages in thread
From: Stefan Liebler @ 2020-06-25 8:18 UTC (permalink / raw)
To: GNU C Library
Just for information: if nobody objects, I'll commit this patch tomorrow.
On 6/19/20 3:51 PM, Stefan Liebler wrote:
> It turned out that an 256b-mvc instruction which depends on the
> result of a previous 256b-mvc instruction is counterproductive.
> Therefore this patch adjusts the 256b-loop by storing the
> first byte with stc and setting the remaining 255b with mvc.
> Now the 255b-mvc instruction depends on the stc instruction.
> ---
> sysdeps/s390/memset-z900.S | 19 +++++++++----------
> 1 file changed, 9 insertions(+), 10 deletions(-)
>
> diff --git a/sysdeps/s390/memset-z900.S b/sysdeps/s390/memset-z900.S
> index ca3eac0522..1e0c334156 100644
> --- a/sysdeps/s390/memset-z900.S
> +++ b/sysdeps/s390/memset-z900.S
> @@ -157,28 +157,27 @@ ENTRY(MEMSET_Z196)
> # if !defined __s390x__
> llgfr %r4,%r4
> # endif /* !defined __s390x__ */
> - ltgr %r4,%r4
> - je .L_Z196_4
> + clgfi %r4,1
> + jl .L_Z196_4 # n == 0
> stc %r3,0(%r2)
> + je .L_Z196_4 # n == 1
> + aghi %r4,-2
> lgr %r1,%r2
> - cghi %r4,1
> - je .L_Z196_4
> - aghi %r4,-2
> - srlg %r5,%r4,8
> - ltgr %r5,%r5
> - jne .L_Z196_1
> + risbg %r5,%r4,8,128+63,56 # r5 = n / 256
> + jne .L_Z196_1 # Jump away if r5 != 0
> .L_Z196_3:
> exrl %r4,.L_Z196_17
> .L_Z196_4:
> br %r14
> .L_Z196_1:
> cgfi %r5,1048576
> - jh __memset_mvcle # Switch to mvcle for >256MB
> + jh __memset_mvcle # Switch to mvcle for >256MB
> .L_Z196_2:
> pfd 2,1024(%r1)
> - mvc 1(256,%r1),0(%r1)
> + mvc 1(255,%r1),0(%r1)
> aghi %r5,-1
> la %r1,256(%r1)
> + stc %r3,0(%r1)
> jne .L_Z196_2
> j .L_Z196_3
> .L_Z196_17:
>
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [PATCH] S390: Optimize __memset_z196.
2020-06-25 8:18 ` Stefan Liebler
@ 2020-06-26 7:47 ` Stefan Liebler
0 siblings, 0 replies; 3+ messages in thread
From: Stefan Liebler @ 2020-06-26 7:47 UTC (permalink / raw)
To: libc-alpha
committed
On 6/25/20 10:18 AM, Stefan Liebler via Libc-alpha wrote:
> Just as information, if nobody opposes, I'll commit this patch tomorrow.
>
> On 6/19/20 3:51 PM, Stefan Liebler wrote:
>> It turned out that an 256b-mvc instruction which depends on the
>> result of a previous 256b-mvc instruction is counterproductive.
>> Therefore this patch adjusts the 256b-loop by storing the
>> first byte with stc and setting the remaining 255b with mvc.
>> Now the 255b-mvc instruction depends on the stc instruction.
>> ---
>> sysdeps/s390/memset-z900.S | 19 +++++++++----------
>> 1 file changed, 9 insertions(+), 10 deletions(-)
>>
>> diff --git a/sysdeps/s390/memset-z900.S b/sysdeps/s390/memset-z900.S
>> index ca3eac0522..1e0c334156 100644
>> --- a/sysdeps/s390/memset-z900.S
>> +++ b/sysdeps/s390/memset-z900.S
>> @@ -157,28 +157,27 @@ ENTRY(MEMSET_Z196)
>> # if !defined __s390x__
>> llgfr %r4,%r4
>> # endif /* !defined __s390x__ */
>> - ltgr %r4,%r4
>> - je .L_Z196_4
>> + clgfi %r4,1
>> + jl .L_Z196_4 # n == 0
>> stc %r3,0(%r2)
>> + je .L_Z196_4 # n == 1
>> + aghi %r4,-2
>> lgr %r1,%r2
>> - cghi %r4,1
>> - je .L_Z196_4
>> - aghi %r4,-2
>> - srlg %r5,%r4,8
>> - ltgr %r5,%r5
>> - jne .L_Z196_1
>> + risbg %r5,%r4,8,128+63,56 # r5 = n / 256
>> + jne .L_Z196_1 # Jump away if r5 != 0
>> .L_Z196_3:
>> exrl %r4,.L_Z196_17
>> .L_Z196_4:
>> br %r14
>> .L_Z196_1:
>> cgfi %r5,1048576
>> - jh __memset_mvcle # Switch to mvcle for >256MB
>> + jh __memset_mvcle # Switch to mvcle for >256MB
>> .L_Z196_2:
>> pfd 2,1024(%r1)
>> - mvc 1(256,%r1),0(%r1)
>> + mvc 1(255,%r1),0(%r1)
>> aghi %r5,-1
>> la %r1,256(%r1)
>> + stc %r3,0(%r1)
>> jne .L_Z196_2
>> j .L_Z196_3
>> .L_Z196_17:
>>
>
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2020-06-26 7:47 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-06-19 13:51 [PATCH] S390: Optimize __memset_z196 Stefan Liebler
2020-06-25 8:18 ` Stefan Liebler
2020-06-26 7:47 ` Stefan Liebler
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).