public inbox for libc-stable@sourceware.org
* [2.27 COMMITTED][AArch64] Backport memcpy_falkor improvements
@ 2019-01-01  0:00 Wilco Dijkstra
From: Wilco Dijkstra @ 2019-01-01  0:00 UTC (permalink / raw)
  To: libc-stable; +Cc: nd

commit e6b7252040755cc965e71622084b9b5ee05345ff
Author: Siddhesh Poyarekar <siddhesh@sourceware.org>
Date:   Fri Jun 29 22:45:59 2018 +0530

    aarch64,falkor: Use vector registers for memcpy
    
    Vector registers perform better than scalar register pairs for copying
    data, so prefer them instead.  This results in a time reduction of over
    50% (i.e. a 2x speed improvement) for some smaller sizes in memcpy-walk.
    Larger sizes show improvements of around 1% to 2%.  memcpy-random shows
    a very small improvement, in the range of 1-2%.
    
        * sysdeps/aarch64/multiarch/memcpy_falkor.S (__memcpy_falkor):
        Use vector registers.
    
    (cherry picked from commit 0aec4c1d1801e8016ebe89281d16597e0557b8be)
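
    For readers less fluent in AArch64 assembly, the change amounts to
    roughly the C-level sketch below (hypothetical helper names, ACLE NEON
    intrinsics; an illustration of the idea, not the actual implementation):
    each 16-byte chunk now moves through a single 128-bit SIMD register
    instead of a pair of 64-bit general-purpose registers.

#include <stdint.h>
#include <string.h>
#include <arm_neon.h>

/* New scheme: one 16-byte vector load and store per chunk, roughly what
   the ldr/str of a Q register in the patch does.  */
static inline void copy16_vector (uint8_t *dst, const uint8_t *src)
{
  uint8x16_t v = vld1q_u8 (src);
  vst1q_u8 (dst, v);
}

/* Old scheme: a pair of 64-bit scalar transfers (ldp/stp on x6/x7),
   i.e. two general-purpose register moves per 16 bytes.  */
static inline void copy16_scalar_pair (uint8_t *dst, const uint8_t *src)
{
  uint64_t lo, hi;
  memcpy (&lo, src, 8);
  memcpy (&hi, src + 8, 8);
  memcpy (dst, &lo, 8);
  memcpy (dst + 8, &hi, 8);
}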

commit c74b884f705aa54998c4b94ac8b098b3ac40e465
Author: Siddhesh Poyarekar <siddhesh@sourceware.org>
Date:   Fri May 11 00:11:52 2018 +0530

    aarch64,falkor: Ignore prefetcher tagging for smaller copies
    
    For smaller and medium-sized copies, the effect of hardware
    prefetching is not as dominant as instruction level parallelism.
    Hence it makes more sense to load data into multiple registers than to
    try to route them to the same prefetch unit.  This is also the case
    for the loop exit, where we are unable to latch on to the same prefetch
    unit anyway, so it makes more sense to have data loaded in parallel.
    
    The performance results are a bit mixed with memcpy-random, with
    numbers jumping between -1% and +3%, i.e. the numbers don't seem
    repeatable.  memcpy-walk sees a 70% improvement (i.e. > 2x) for 128
    bytes, and that improvement tapers off as the impact of the tail copy
    decreases in comparison to the loop.
    
        * sysdeps/aarch64/multiarch/memcpy_falkor.S (__memcpy_falkor):
        Use multiple registers to copy data in loop tail.
    
    (cherry picked from commit db725a458e1cb0e17204daa543744faf08bb2e06)
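
    The effect of this change on the medium-size and tail paths is easiest
    to see in C terms (hypothetical helper names, NEON intrinsics, sketch
    only): all loads are issued into distinct registers before the
    dependent stores, instead of reusing one register serially.

#include <stdint.h>
#include <arm_neon.h>

/* Before: each 16-byte chunk reuses the same temporary, so every store
   must wait for the load immediately before it.  */
static void tail32_serial (uint8_t *dstend, const uint8_t *srcend)
{
  uint8x16_t a;
  a = vld1q_u8 (srcend - 32);
  vst1q_u8 (dstend - 32, a);
  a = vld1q_u8 (srcend - 16);
  vst1q_u8 (dstend - 16, a);
}

/* After: both loads go into separate registers up front, so the core can
   issue them in parallel before the stores.  */
static void tail32_parallel (uint8_t *dstend, const uint8_t *srcend)
{
  uint8x16_t b = vld1q_u8 (srcend - 32);
  uint8x16_t c = vld1q_u8 (srcend - 16);
  vst1q_u8 (dstend - 32, b);
  vst1q_u8 (dstend - 16, c);
}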

diff --git a/ChangeLog b/ChangeLog
index 99b6180..dd2106c 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,15 @@
 2019-09-06  Siddhesh Poyarekar  <siddhesh@sourceware.org>
 
+       * sysdeps/aarch64/multiarch/memcpy_falkor.S (__memcpy_falkor):
+       Use vector registers.
+
+2019-09-06  Siddhesh Poyarekar  <siddhesh@sourceware.org>
+
+       * sysdeps/aarch64/multiarch/memcpy_falkor.S (__memcpy_falkor):
+       Use multiple registers to copy data in loop tail.
+
+2019-09-06  Siddhesh Poyarekar  <siddhesh@sourceware.org>
+
        * sysdeps/aarch64/strncmp.S (strncmp): Use lsr instead of
        mov + lsr.
 
diff --git a/sysdeps/aarch64/multiarch/memcpy_falkor.S b/sysdeps/aarch64/multiarch/memcpy_falkor.S
index 8dd8c1e..cdc2de4 100644
--- a/sysdeps/aarch64/multiarch/memcpy_falkor.S
+++ b/sysdeps/aarch64/multiarch/memcpy_falkor.S
@@ -29,11 +29,19 @@
 #define dst    x3
 #define srcend x4
 #define dstend x5
-#define A_l    x6
-#define A_lw   w6
-#define A_h    x7
-#define A_hw   w7
 #define tmp1   x14
+#define A_x    x6
+#define B_x    x7
+#define A_w    w6
+#define B_w    w7
+
+#define A_q    q0
+#define B_q    q1
+#define C_q    q2
+#define D_q    q3
+#define E_q    q4
+#define F_q    q5
+#define G_q    q6
 
 /* Copies are split into 3 main cases:
 
@@ -53,9 +61,9 @@
    bumping up the small copies up to 32 bytes allows us to do that without
    cost and also allows us to reduce the size of the prep code before loop64.
 
-   All copies are done only via two registers r6 and r7.  This is to ensure
-   that all loads hit a single hardware prefetcher which can get correctly
-   trained to prefetch a single stream.
+   The copy loop uses only one register q0.  This is to ensure that all loads
+   hit a single hardware prefetcher which can get correctly trained to prefetch
+   a single stream.
 
    The non-temporal stores help optimize cache utilization.  */
 
@@ -66,29 +74,29 @@ ENTRY_ALIGN (__memcpy_falkor, 6)
        add     srcend, src, count
        add     dstend, dstin, count
        b.ls    L(copy32)
-       ldp     A_l, A_h, [src]
+       ldr     A_q, [src]
        cmp     count, 128
-       stp     A_l, A_h, [dstin]
+       str     A_q, [dstin]
        b.hi    L(copy_long)
 
        /* Medium copies: 33..128 bytes.  */
        sub     tmp1, count, 1
-       ldp     A_l, A_h, [src, 16]
-       stp     A_l, A_h, [dstin, 16]
+       ldr     A_q, [src, 16]
+       ldr     B_q, [srcend, -32]
+       ldr     C_q, [srcend, -16]
        tbz     tmp1, 6, 1f
-       ldp     A_l, A_h, [src, 32]
-       stp     A_l, A_h, [dstin, 32]
-       ldp     A_l, A_h, [src, 48]
-       stp     A_l, A_h, [dstin, 48]
-       ldp     A_l, A_h, [srcend, -64]
-       stp     A_l, A_h, [dstend, -64]
-       ldp     A_l, A_h, [srcend, -48]
-       stp     A_l, A_h, [dstend, -48]
+       ldr     D_q, [src, 32]
+       ldr     E_q, [src, 48]
+       str     D_q, [dstin, 32]
+       str     E_q, [dstin, 48]
+       ldr     F_q, [srcend, -64]
+       ldr     G_q, [srcend, -48]
+       str     F_q, [dstend, -64]
+       str     G_q, [dstend, -48]
 1:
-       ldp     A_l, A_h, [srcend, -32]
-       stp     A_l, A_h, [dstend, -32]
-       ldp     A_l, A_h, [srcend, -16]
-       stp     A_l, A_h, [dstend, -16]
+       str     A_q, [dstin, 16]
+       str     B_q, [dstend, -32]
+       str     C_q, [dstend, -16]
        ret
 
        .p2align 4
@@ -97,44 +105,44 @@ L(copy32):
        /* 16-32 */
        cmp     count, 16
        b.lo    1f
-       ldp     A_l, A_h, [src]
-       stp     A_l, A_h, [dstin]
-       ldp     A_l, A_h, [srcend, -16]
-       stp     A_l, A_h, [dstend, -16]
+       ldr     A_q, [src]
+       ldr     B_q, [srcend, -16]
+       str     A_q, [dstin]
+       str     B_q, [dstend, -16]
        ret
        .p2align 4
 1:
        /* 8-15 */
        tbz     count, 3, 1f
-       ldr     A_l, [src]
-       str     A_l, [dstin]
-       ldr     A_l, [srcend, -8]
-       str     A_l, [dstend, -8]
+       ldr     A_x, [src]
+       ldr     B_x, [srcend, -8]
+       str     A_x, [dstin]
+       str     B_x, [dstend, -8]
        ret
        .p2align 4
 1:
        /* 4-7 */
        tbz     count, 2, 1f
-       ldr     A_lw, [src]
-       str     A_lw, [dstin]
-       ldr     A_lw, [srcend, -4]
-       str     A_lw, [dstend, -4]
+       ldr     A_w, [src]
+       ldr     B_w, [srcend, -4]
+       str     A_w, [dstin]
+       str     B_w, [dstend, -4]
        ret
        .p2align 4
 1:
        /* 2-3 */
        tbz     count, 1, 1f
-       ldrh    A_lw, [src]
-       strh    A_lw, [dstin]
-       ldrh    A_lw, [srcend, -2]
-       strh    A_lw, [dstend, -2]
+       ldrh    A_w, [src]
+       ldrh    B_w, [srcend, -2]
+       strh    A_w, [dstin]
+       strh    B_w, [dstend, -2]
        ret
        .p2align 4
 1:
        /* 0-1 */
        tbz     count, 0, 1f
-       ldrb    A_lw, [src]
-       strb    A_lw, [dstin]
+       ldrb    A_w, [src]
+       strb    A_w, [dstin]
 1:
        ret
 
@@ -153,30 +161,29 @@ L(copy_long):
        add     count, count, tmp1
 
 L(loop64):
-       ldp     A_l, A_h, [src, 16]!
-       stnp    A_l, A_h, [dst, 16]
-       ldp     A_l, A_h, [src, 16]!
+       ldr     A_q, [src, 16]!
+       str     A_q, [dst, 16]
+       ldr     A_q, [src, 16]!
        subs    count, count, 64
-       stnp    A_l, A_h, [dst, 32]
-       ldp     A_l, A_h, [src, 16]!
-       stnp    A_l, A_h, [dst, 48]
-       ldp     A_l, A_h, [src, 16]!
-       stnp    A_l, A_h, [dst, 64]
-       add     dst, dst, 64
+       str     A_q, [dst, 32]
+       ldr     A_q, [src, 16]!
+       str     A_q, [dst, 48]
+       ldr     A_q, [src, 16]!
+       str     A_q, [dst, 64]!
        b.hi    L(loop64)
 
        /* Write the last full set of 64 bytes.  The remainder is at most 64
           bytes, so it is safe to always copy 64 bytes from the end even if
           there is just 1 byte left.  */
 L(last64):
-       ldp     A_l, A_h, [srcend, -64]
-       stnp    A_l, A_h, [dstend, -64]
-       ldp     A_l, A_h, [srcend, -48]
-       stnp    A_l, A_h, [dstend, -48]
-       ldp     A_l, A_h, [srcend, -32]
-       stnp    A_l, A_h, [dstend, -32]
-       ldp     A_l, A_h, [srcend, -16]
-       stnp    A_l, A_h, [dstend, -16]
+       ldr     E_q, [srcend, -64]
+       str     E_q, [dstend, -64]
+       ldr     D_q, [srcend, -48]
+       str     D_q, [dstend, -48]
+       ldr     C_q, [srcend, -32]
+       str     C_q, [dstend, -32]
+       ldr     B_q, [srcend, -16]
+       str     B_q, [dstend, -16]
        ret
 
 END (__memcpy_falkor)
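
The main copy loop above still funnels every load through the same
register (now A_q/q0 rather than the x6/x7 pair) so the loads stay on one
sequential stream for Falkor's hardware prefetcher, while the tail is
copied through separate registers.  A loose C approximation of that
structure (illustrative only; it assumes count >= 64 and ignores the
alignment adjustments the assembly performs) is:

#include <stddef.h>
#include <stdint.h>
#include <arm_neon.h>

static void copy_long_sketch (uint8_t *dst, const uint8_t *src, size_t count)
{
  uint8_t *dstend = dst + count;
  const uint8_t *srcend = src + count;
  uint8x16_t a;

  /* L(loop64): 64 bytes per iteration through a single temporary.  */
  while (count > 64)
    {
      a = vld1q_u8 (src);       vst1q_u8 (dst, a);
      a = vld1q_u8 (src + 16);  vst1q_u8 (dst + 16, a);
      a = vld1q_u8 (src + 32);  vst1q_u8 (dst + 32, a);
      a = vld1q_u8 (src + 48);  vst1q_u8 (dst + 48, a);
      src += 64;
      dst += 64;
      count -= 64;
    }

  /* L(last64): unconditionally copy the final 64 bytes from the end,
     possibly overlapping bytes already written, using distinct registers
     so the loads can run in parallel.  */
  uint8x16_t e = vld1q_u8 (srcend - 64);
  uint8x16_t d = vld1q_u8 (srcend - 48);
  uint8x16_t c = vld1q_u8 (srcend - 32);
  uint8x16_t b = vld1q_u8 (srcend - 16);
  vst1q_u8 (dstend - 64, e);
  vst1q_u8 (dstend - 48, d);
  vst1q_u8 (dstend - 32, c);
  vst1q_u8 (dstend - 16, b);
}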


* [2.27 COMMITTED][AArch64] Backport memcpy_falkor improvements
@ 2019-01-01  0:00 Wilco Dijkstra
From: Wilco Dijkstra @ 2019-01-01  0:00 UTC (permalink / raw)
  To: libc-stable; +Cc: nd

commit ad64510e5c74729108a02a6c22f03aa8ee07a8d3
Author: Siddhesh Poyarekar <siddhesh@sourceware.org>
Date:   Fri Jun 29 22:45:59 2018 +0530

    aarch64,falkor: Use vector registers for memcpy
    
    Vector registers perform better than scalar register pairs for copying
    data, so prefer them instead.  This results in a time reduction of over
    50% (i.e. a 2x speed improvement) for some smaller sizes in memcpy-walk.
    Larger sizes show improvements of around 1% to 2%.  memcpy-random shows
    a very small improvement, in the range of 1-2%.
    
        * sysdeps/aarch64/multiarch/memcpy_falkor.S (__memcpy_falkor):
        Use vector registers.
    
    (cherry picked from commit 0aec4c1d1801e8016ebe89281d16597e0557b8be)

commit d3c05bfffa65b39ba64d62e24ed4a6c118fab3ef
Author: Siddhesh Poyarekar <siddhesh@sourceware.org>
Date:   Fri May 11 00:11:52 2018 +0530

    aarch64,falkor: Ignore prefetcher tagging for smaller copies
    
    For smaller and medium-sized copies, the effect of hardware
    prefetching is not as dominant as instruction level parallelism.
    Hence it makes more sense to load data into multiple registers than to
    try to route them to the same prefetch unit.  This is also the case
    for the loop exit, where we are unable to latch on to the same prefetch
    unit anyway, so it makes more sense to have data loaded in parallel.
    
    The performance results are a bit mixed with memcpy-random, with
    numbers jumping between -1% and +3%, i.e. the numbers don't seem
    repeatable.  memcpy-walk sees a 70% improvement (i.e. > 2x) for 128
    bytes, and that improvement tapers off as the impact of the tail copy
    decreases in comparison to the loop.
    
        * sysdeps/aarch64/multiarch/memcpy_falkor.S (__memcpy_falkor):
        Use multiple registers to copy data in loop tail.
    
    (cherry picked from commit db725a458e1cb0e17204daa543744faf08bb2e06)
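
    The memcpy-walk and memcpy-random figures quoted above come from the
    glibc benchtests.  As a rough idea of what the "walk" measurement
    looks like, here is a minimal standalone timing sketch (my own
    approximation, not benchtests/bench-memcpy-walk.c): it walks the
    source and destination forward through a large region so successive
    calls touch fresh memory.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#define REGION (64UL * 1024 * 1024)   /* 64 MiB working region */

int main (void)
{
  size_t size = 128;                  /* copy size under test */
  unsigned char *buf = malloc (2 * REGION);
  if (buf == NULL)
    return 1;
  memset (buf, 1, 2 * REGION);

  unsigned char *src = buf;
  unsigned char *dst = buf + REGION;

  struct timespec t0, t1;
  clock_gettime (CLOCK_MONOTONIC, &t0);
  size_t iters = 0;
  for (size_t off = 0; off + size <= REGION; off += size, iters++)
    memcpy (dst + off, src + off, size);
  clock_gettime (CLOCK_MONOTONIC, &t1);

  double ns = (t1.tv_sec - t0.tv_sec) * 1e9 + (t1.tv_nsec - t0.tv_nsec);
  printf ("%zu copies of %zu bytes: %.1f ns/copy\n", iters, size,
          ns / iters);
  free (buf);
  return 0;
}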

diff --git a/ChangeLog b/ChangeLog
index e9557b8..0482b0c 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,15 @@
 2019-09-06  Siddhesh Poyarekar  <siddhesh@sourceware.org>
 
+       * sysdeps/aarch64/multiarch/memcpy_falkor.S (__memcpy_falkor):
+       Use vector registers.
+
+2019-09-06  Siddhesh Poyarekar  <siddhesh@sourceware.org>
+
+       * sysdeps/aarch64/multiarch/memcpy_falkor.S (__memcpy_falkor):
+       Use multiple registers to copy data in loop tail.
+
+2019-09-06  Siddhesh Poyarekar  <siddhesh@sourceware.org>
+
        * sysdeps/aarch64/strncmp.S (strncmp): Use lsr instead of
        mov + lsr.
 
diff --git a/sysdeps/aarch64/multiarch/memcpy_falkor.S b/sysdeps/aarch64/multiarch/memcpy_falkor.S
index dea4f22..9cde8dc 100644
--- a/sysdeps/aarch64/multiarch/memcpy_falkor.S
+++ b/sysdeps/aarch64/multiarch/memcpy_falkor.S
@@ -29,11 +29,19 @@
 #define dst    x3
 #define srcend x4
 #define dstend x5
-#define A_l    x6
-#define A_lw   w6
-#define A_h    x7
-#define A_hw   w7
 #define tmp1   x14
+#define A_x    x6
+#define B_x    x7
+#define A_w    w6
+#define B_w    w7
+
+#define A_q    q0
+#define B_q    q1
+#define C_q    q2
+#define D_q    q3
+#define E_q    q4
+#define F_q    q5
+#define G_q    q6
 
 /* Copies are split into 3 main cases:
 
@@ -53,9 +61,9 @@
    bumping up the small copies up to 32 bytes allows us to do that without
    cost and also allows us to reduce the size of the prep code before loop64.
 
-   All copies are done only via two registers r6 and r7.  This is to ensure
-   that all loads hit a single hardware prefetcher which can get correctly
-   trained to prefetch a single stream.
+   The copy loop uses only one register q0.  This is to ensure that all loads
+   hit a single hardware prefetcher which can get correctly trained to prefetch
+   a single stream.
 
    The non-temporal stores help optimize cache utilization.  */
 
@@ -66,29 +74,29 @@ ENTRY_ALIGN (__memcpy_falkor, 6)
        add     srcend, src, count
        add     dstend, dstin, count
        b.ls    L(copy32)
-       ldp     A_l, A_h, [src]
+       ldr     A_q, [src]
        cmp     count, 128
-       stp     A_l, A_h, [dstin]
+       str     A_q, [dstin]
        b.hi    L(copy_long)
 
        /* Medium copies: 33..128 bytes.  */
        sub     tmp1, count, 1
-       ldp     A_l, A_h, [src, 16]
-       stp     A_l, A_h, [dstin, 16]
+       ldr     A_q, [src, 16]
+       ldr     B_q, [srcend, -32]
+       ldr     C_q, [srcend, -16]
        tbz     tmp1, 6, 1f
-       ldp     A_l, A_h, [src, 32]
-       stp     A_l, A_h, [dstin, 32]
-       ldp     A_l, A_h, [src, 48]
-       stp     A_l, A_h, [dstin, 48]
-       ldp     A_l, A_h, [srcend, -64]
-       stp     A_l, A_h, [dstend, -64]
-       ldp     A_l, A_h, [srcend, -48]
-       stp     A_l, A_h, [dstend, -48]
+       ldr     D_q, [src, 32]
+       ldr     E_q, [src, 48]
+       str     D_q, [dstin, 32]
+       str     E_q, [dstin, 48]
+       ldr     F_q, [srcend, -64]
+       ldr     G_q, [srcend, -48]
+       str     F_q, [dstend, -64]
+       str     G_q, [dstend, -48]
 1:
-       ldp     A_l, A_h, [srcend, -32]
-       stp     A_l, A_h, [dstend, -32]
-       ldp     A_l, A_h, [srcend, -16]
-       stp     A_l, A_h, [dstend, -16]
+       str     A_q, [dstin, 16]
+       str     B_q, [dstend, -32]
+       str     C_q, [dstend, -16]
        ret
 
        .p2align 4
@@ -97,44 +105,44 @@ L(copy32):
        /* 16-32 */
        cmp     count, 16
        b.lo    1f
-       ldp     A_l, A_h, [src]
-       stp     A_l, A_h, [dstin]
-       ldp     A_l, A_h, [srcend, -16]
-       stp     A_l, A_h, [dstend, -16]
+       ldr     A_q, [src]
+       ldr     B_q, [srcend, -16]
+       str     A_q, [dstin]
+       str     B_q, [dstend, -16]
        ret
        .p2align 4
 1:
        /* 8-15 */
        tbz     count, 3, 1f
-       ldr     A_l, [src]
-       str     A_l, [dstin]
-       ldr     A_l, [srcend, -8]
-       str     A_l, [dstend, -8]
+       ldr     A_x, [src]
+       ldr     B_x, [srcend, -8]
+       str     A_x, [dstin]
+       str     B_x, [dstend, -8]
        ret
        .p2align 4
 1:
        /* 4-7 */
        tbz     count, 2, 1f
-       ldr     A_lw, [src]
-       str     A_lw, [dstin]
-       ldr     A_lw, [srcend, -4]
-       str     A_lw, [dstend, -4]
+       ldr     A_w, [src]
+       ldr     B_w, [srcend, -4]
+       str     A_w, [dstin]
+       str     B_w, [dstend, -4]
        ret
        .p2align 4
 1:
        /* 2-3 */
        tbz     count, 1, 1f
-       ldrh    A_lw, [src]
-       strh    A_lw, [dstin]
-       ldrh    A_lw, [srcend, -2]
-       strh    A_lw, [dstend, -2]
+       ldrh    A_w, [src]
+       ldrh    B_w, [srcend, -2]
+       strh    A_w, [dstin]
+       strh    B_w, [dstend, -2]
        ret
        .p2align 4
 1:
        /* 0-1 */
        tbz     count, 0, 1f
-       ldrb    A_lw, [src]
-       strb    A_lw, [dstin]
+       ldrb    A_w, [src]
+       strb    A_w, [dstin]
 1:
        ret
 
@@ -153,30 +161,29 @@ L(copy_long):
        add     count, count, tmp1
 
 L(loop64):
-       ldp     A_l, A_h, [src, 16]!
-       stnp    A_l, A_h, [dst, 16]
-       ldp     A_l, A_h, [src, 16]!
+       ldr     A_q, [src, 16]!
+       str     A_q, [dst, 16]
+       ldr     A_q, [src, 16]!
        subs    count, count, 64
-       stnp    A_l, A_h, [dst, 32]
-       ldp     A_l, A_h, [src, 16]!
-       stnp    A_l, A_h, [dst, 48]
-       ldp     A_l, A_h, [src, 16]!
-       stnp    A_l, A_h, [dst, 64]
-       add     dst, dst, 64
+       str     A_q, [dst, 32]
+       ldr     A_q, [src, 16]!
+       str     A_q, [dst, 48]
+       ldr     A_q, [src, 16]!
+       str     A_q, [dst, 64]!
        b.hi    L(loop64)
 
        /* Write the last full set of 64 bytes.  The remainder is at most 64
           bytes, so it is safe to always copy 64 bytes from the end even if
           there is just 1 byte left.  */
 L(last64):
-       ldp     A_l, A_h, [srcend, -64]
-       stnp    A_l, A_h, [dstend, -64]
-       ldp     A_l, A_h, [srcend, -48]
-       stnp    A_l, A_h, [dstend, -48]
-       ldp     A_l, A_h, [srcend, -32]
-       stnp    A_l, A_h, [dstend, -32]
-       ldp     A_l, A_h, [srcend, -16]
-       stnp    A_l, A_h, [dstend, -16]
+       ldr     E_q, [srcend, -64]
+       str     E_q, [dstend, -64]
+       ldr     D_q, [srcend, -48]
+       str     D_q, [dstend, -48]
+       ldr     C_q, [srcend, -32]
+       str     C_q, [dstend, -32]
+       ldr     B_q, [srcend, -16]
+       str     B_q, [dstend, -16]
        ret
 
 END (__memcpy_falkor)
