* [PATCH] powerpc: st{r,p}cpy optimization for aligned strings
@ 2017-12-07 7:54 Rajalakshmi Srinivasaraghavan
2017-12-15 1:36 ` Tulio Magno Quites Machado Filho
0 siblings, 1 reply; 2+ messages in thread
From: Rajalakshmi Srinivasaraghavan @ 2017-12-07 7:54 UTC (permalink / raw)
To: libc-alpha; +Cc: Rajalakshmi Srinivasaraghavan
This patch makes use of vector instructions for aligned inputs. Improvements
of up to 30% are seen for larger aligned inputs.
2017-12-05 Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>
* sysdeps/powerpc/powerpc64/power8/strcpy.S: Use vectors
for aligned inputs.
---
sysdeps/powerpc/powerpc64/power8/strcpy.S | 149 +++++++++++++++++++++++++++++-
1 file changed, 146 insertions(+), 3 deletions(-)
diff --git a/sysdeps/powerpc/powerpc64/power8/strcpy.S b/sysdeps/powerpc/powerpc64/power8/strcpy.S
index 13e7a0fcbc..a1683f9dfe 100644
--- a/sysdeps/powerpc/powerpc64/power8/strcpy.S
+++ b/sysdeps/powerpc/powerpc64/power8/strcpy.S
@@ -47,7 +47,7 @@
64K as default, the page cross handling assumes minimum page size of
4k. */
- .machine power7
+ .machine power8
ENTRY_TOCLESS (FUNC_NAME, 4)
li r0,0 /* Doubleword with null chars to use
with cmpb. */
@@ -120,7 +120,7 @@ L(pagecross):
ldu r8, 8(r7)
L(loop_before):
- /* Save the two doublewords readed from source and align the source
+ /* Save the two doublewords read from source and align the source
to 16 bytes for the loop. */
mr r11,r3
std r12,0(r11)
@@ -129,7 +129,150 @@ L(loop_before):
rldicl r9,r4,0,60
subf r7,r9,r7
subf r11,r9,r11
- b L(loop_start)
+ /* Source is adjusted to 16B alignment and destination r11 is
+ also moved based on that adjustment. Now check if r11 is
+ also 16B aligned to move to vectorized loop. */
+ andi. r6, r11, 0xF
+ bne L(loop_start)
+
+ /* Prepare for the loop. */
+ subf r4, r9, r4 /* Adjust r4 based on alignment. */
+ li r7, 16 /* Load required offsets. */
+ li r8, 32
+ li r9, 48
+ vspltisb v0, 0
+ addi r4, r4, 16
+ /* Are we 64-byte aligned? If so, jump to the vectorized loop.
+ Else copy 16B till r4 is 64B aligned. */
+ andi. r6, r4, 63
+ beq L(qw_loop)
+
+ lvx v6, 0, r4 /* Load 16 bytes from memory. */
+ vcmpequb. v5, v0, v6 /* Check for null. */
+ bne cr6, L(qw_done)
+ stvx v6, 0, r11 /* Store 16 bytes. */
+ addi r4, r4, 16 /* Increment the address. */
+ addi r11, r11, 16
+ andi. r6, r4, 63
+ beq L(qw_loop)
+
+ lvx v6, 0, r4
+ vcmpequb. v5, v0, v6
+ bne cr6, L(qw_done)
+ stvx v6, 0, r11
+ addi r4, r4, 16
+ addi r11, r11, 16
+ andi. r6, r4, 63
+ beq L(qw_loop)
+
+ lvx v6, 0, r4
+ vcmpequb. v5, v0, v6
+ bne cr6, L(qw_done)
+ stvx v6, 0, r11
+ addi r4, r4, 16
+ addi r11, r11, 16
+
+ .align 4
+L(qw_loop):
+ lvx v1, r4, r0 /* Load 4 quadwords. */
+ lvx v2, r4, r7
+ lvx v3, r4, r8
+ lvx v4, r4, r9
+ vminub v5, v1, v2 /* Compare and merge into one VR for speed. */
+ vminub v8, v3, v4
+ vminub v7, v5, v8
+ vcmpequb. v7, v7, v0 /* Check for NULLs. */
+ bne cr6, L(qw_loop_done)
+ stvx v1, r11, r0 /* Store 4 quadwords. */
+ stvx v2, r11, r7
+ stvx v3, r11, r8
+ stvx v4, r11, r9
+ addi r4, r4, 64 /* Adjust address for the next iteration. */
+ addi r11, r11, 64 /* Adjust address for the next iteration. */
+
+ lvx v1, r4, r0 /* Load 4 quadwords. */
+ lvx v2, r4, r7
+ lvx v3, r4, r8
+ lvx v4, r4, r9
+ vminub v5, v1, v2 /* Compare and merge into one VR for speed. */
+ vminub v8, v3, v4
+ vminub v7, v5, v8
+ vcmpequb. v7, v7, v0 /* Check for NULLs. */
+ bne cr6, L(qw_loop_done)
+ stvx v1, r11, r0 /* Store 4 quadwords. */
+ stvx v2, r11, r7
+ stvx v3, r11, r8
+ stvx v4, r11, r9
+ addi r4, r4, 64 /* Adjust address for the next iteration. */
+ addi r11, r11, 64 /* Adjust address for the next iteration. */
+
+ lvx v1, r4, r0 /* Load 4 quadwords. */
+ lvx v2, r4, r7
+ lvx v3, r4, r8
+ lvx v4, r4, r9
+ vminub v5, v1, v2 /* Compare and merge into one VR for speed. */
+ vminub v8, v3, v4
+ vminub v7, v5, v8
+ vcmpequb. v7, v7, v0 /* Check for NULLs. */
+ bne cr6, L(qw_loop_done)
+ stvx v1, r11, r0 /* Store 4 quadwords. */
+ stvx v2, r11, r7
+ stvx v3, r11, r8
+ stvx v4, r11, r9
+ addi r4, r4, 64 /* Adjust address for the next iteration. */
+ addi r11, r11, 64 /* Adjust address for the next iteration. */
+ b L(qw_loop)
+
+ .align 4
+L(qw_loop_done):
+ /* Null found in one of the 4 loads. */
+ vcmpequb. v7, v1, v0
+ vor v6, v1, v1
+ bne cr6, L(qw_done)
+ /* Not on the first 16B, so store it. */
+ stvx v1, r11, r0
+ addi r4, r4, 16
+ addi r11, r11, 16
+ vcmpequb. v7, v2, v0
+ vor v6, v2, v2
+ bne cr6, L(qw_done)
+ /* Not on the second 16B, so store it. */
+ stvx v2, r11, r0
+ addi r4, r4, 16
+ addi r11, r11, 16
+ vcmpequb. v7, v3, v0
+ vor v6, v3, v3
+ bne cr6, L(qw_done)
+ /* Not on the third 16B, so store it. */
+ stvx v6, r11, r0
+ addi r4, r4, 16
+ addi r11, r11, 16
+ vor v6, v4, v4
+
+ .align 4
+L(qw_done):
+ mr r7, r4
+ /* Move the result to GPR. */
+#ifdef __LITTLE_ENDIAN__
+ vsldoi v4, v6, v0, 8
+ mfvrd r12, v4
+#else
+ mfvrd r12, v6
+#endif
+ /* Check for null in the first 8 bytes. */
+ cmpb r10, r12, r0
+ cmpdi cr6, r10, 0
+ bne cr6, L(done2)
+ /* Null found in second doubleword. */
+#ifdef __LITTLE_ENDIAN__
+ mfvrd r6, v6
+#else
+ vsldoi v6, v6, v0, 8
+ mfvrd r6, v6
+#endif
+ cmpb r10, r6, r0
+ addi r7, r7, 8
+ b L(done2)
.align 5
L(loop):
--
2.11.0
^ permalink raw reply [flat|nested] 2+ messages in thread
* Re: [PATCH] powerpc: st{r,p}cpy optimization for aligned strings
2017-12-07 7:54 [PATCH] powerpc: st{r,p}cpy optimization for aligned strings Rajalakshmi Srinivasaraghavan
@ 2017-12-15 1:36 ` Tulio Magno Quites Machado Filho
0 siblings, 0 replies; 2+ messages in thread
From: Tulio Magno Quites Machado Filho @ 2017-12-15 1:36 UTC (permalink / raw)
To: Rajalakshmi Srinivasaraghavan, libc-alpha
Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com> writes:
> This patch makes use of vectors for aligned inputs. Improvements
> upto 30% seen for larger aligned inputs.
>
> 2017-12-05 Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>
>
> * sysdeps/powerpc/powerpc64/power8/strcpy.S: Use vectors
> for aligned inputs.
Reviewed-by: Tulio Magno Quites Machado Filho <tuliom@linux.vnet.ibm.com>
--
Tulio Magno
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2017-12-15 1:36 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-12-07 7:54 [PATCH] powerpc: st{r,p}cpy optimization for aligned strings Rajalakshmi Srinivasaraghavan
2017-12-15 1:36 ` Tulio Magno Quites Machado Filho
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).