public inbox for glibc-bugs@sourceware.org
* [Bug string/28216] New: interleaving load/store instructions in memcpy for aarch64
@ 2021-08-10  4:28 wangxuszcn at foxmail dot com
  2021-08-10  8:03 ` [Bug string/28216] " wangxuszcn at foxmail dot com
  2022-04-21 14:54 ` wdijkstr at arm dot com
  0 siblings, 2 replies; 3+ messages in thread
From: wangxuszcn at foxmail dot com @ 2021-08-10  4:28 UTC (permalink / raw)
  To: glibc-bugs

https://sourceware.org/bugzilla/show_bug.cgi?id=28216

            Bug ID: 28216
           Summary: interleaving load/store instructions in memcpy for
                    aarch64
           Product: glibc
           Version: unspecified
            Status: UNCONFIRMED
          Severity: enhancement
          Priority: P2
         Component: string
          Assignee: unassigned at sourceware dot org
          Reporter: wangxuszcn at foxmail dot com
  Target Milestone: ---

Created attachment 13609
  --> https://sourceware.org/bugzilla/attachment.cgi?id=13609&action=edit
memcpy_interleaving_load_store_instructions

interleaving load and store instructions in memcpy for aarch64

Background knowledge:
  The Cortex-A57 processor includes separate load and store pipelines, which
allow it to execute one load µop and one store µop every cycle.

  To achieve maximum throughput for memory copy (or similar loops), one should
do the following.
• Unroll the loop to include multiple load and store operations per iteration,
minimizing the overheads of looping.
• Use discrete, non-writeback forms of load and store instructions (such as
LDRD and STRD), interleaving them so that one load and one store operation can
be performed each cycle. Avoid load/store-multiple instruction encodings (such
as LDM and STM), which produce separate bursts of load and store µops and may
not allow concurrent use of both the load and store pipelines.

The following example shows a recommended instruction sequence for a long
memory copy in AArch32 state:
Loop_start:
SUBS r2,r2,#64
LDRD r3,r4,[r1,#0]
STRD r3,r4,[r0,#0]
LDRD r3,r4,[r1,#8]
STRD r3,r4,[r0,#8]
LDRD r3,r4,[r1,#16]
STRD r3,r4,[r0,#16]
LDRD r3,r4,[r1,#24]
STRD r3,r4,[r0,#24]
LDRD r3,r4,[r1,#32]
STRD r3,r4,[r0,#32]
LDRD r3,r4,[r1,#40]
STRD r3,r4,[r0,#40]
LDRD r3,r4,[r1,#48]
STRD r3,r4,[r0,#48]
LDRD r3,r4,[r1,#56]
STRD r3,r4,[r0,#56]
ADD r1,r1,#64
ADD r0,r0,#64
BGT Loop_start

A recommended copy routine for AArch64 would look similar to the sequence
above, but would use LDP/STP instructions.
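
For illustration, a minimal AArch64 sketch of the same pattern (64 bytes per
iteration, each load immediately followed by its store) might look like the
sequence below; the register assignments x0 = dst, x1 = src, x2 = count are
assumptions made for this example, not taken from any particular
implementation:

Loop_start:
SUBS x2,x2,#64
LDP x3,x4,[x1,#0]
STP x3,x4,[x0,#0]
LDP x3,x4,[x1,#16]
STP x3,x4,[x0,#16]
LDP x3,x4,[x1,#32]
STP x3,x4,[x0,#32]
LDP x3,x4,[x1,#48]
STP x3,x4,[x0,#48]
ADD x1,x1,#64
ADD x0,x0,#64
B.GT Loop_start

Each LDP/STP pair moves 16 bytes, so four pairs per iteration cover the same
64 bytes that the eight LDRD/STRD pairs cover in the AArch32 sequence.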

patch:
diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S
index 0adc524..0ea4e3e 100644
--- a/sysdeps/aarch64/memcpy.S
+++ b/sysdeps/aarch64/memcpy.S
@@ -88,8 +88,8 @@ ENTRY_ALIGN (MEMCPY, 6)
        cmp     count, 16
        b.lo    L(copy16)
        ldp     A_l, A_h, [src]
-       ldp     D_l, D_h, [srcend, -16]
        stp     A_l, A_h, [dstin]
+       ldp     D_l, D_h, [srcend, -16]
        stp     D_l, D_h, [dstend, -16]
        ret

@@ -97,8 +97,8 @@ ENTRY_ALIGN (MEMCPY, 6)
 L(copy16):
        tbz     count, 3, L(copy8)
        ldr     A_l, [src]
-       ldr     A_h, [srcend, -8]
        str     A_l, [dstin]
+       ldr     A_h, [srcend, -8]
        str     A_h, [dstend, -8]
        ret

@@ -107,8 +107,8 @@ L(copy16):
 L(copy8):
        tbz     count, 2, L(copy4)
        ldr     A_lw, [src]
-       ldr     B_lw, [srcend, -4]
        str     A_lw, [dstin]
+       ldr     B_lw, [srcend, -4]
        str     B_lw, [dstend, -4]
        ret

@@ -117,11 +117,11 @@ L(copy4):
        cbz     count, L(copy0)
        lsr     tmp1, count, 1
        ldrb    A_lw, [src]
+       strb    A_lw, [dstin]
        ldrb    C_lw, [srcend, -1]
+       strb    C_lw, [dstend, -1]
        ldrb    B_lw, [src, tmp1]
-       strb    A_lw, [dstin]
        strb    B_lw, [dstin, tmp1]
-       strb    C_lw, [dstend, -1]
 L(copy0):
        ret

@@ -148,8 +148,8 @@ L(copy128):
        cmp     count, 96
        b.ls    L(copy96)
        ldp     G_l, G_h, [srcend, -64]
-       ldp     H_l, H_h, [srcend, -48]
        stp     G_l, G_h, [dstend, -64]
+       ldp     H_l, H_h, [srcend, -48]
        stp     H_l, H_h, [dstend, -48]
 L(copy96):
        stp     A_l, A_h, [dstin]
@@ -224,8 +224,8 @@ ENTRY_ALIGN (MEMMOVE, 4)
        cmp     count, 16
        b.lo    L(copy16)
        ldp     A_l, A_h, [src]
-       ldp     D_l, D_h, [srcend, -16]
        stp     A_l, A_h, [dstin]
+       ldp     D_l, D_h, [srcend, -16]
        stp     D_l, D_h, [dstend, -16]
        ret

diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
index 9067ea2..83bae2f 100644
--- a/sysdeps/aarch64/memset.S
+++ b/sysdeps/aarch64/memset.S
@@ -74,8 +74,8 @@ L(set_medium):
           32 bytes from the end.  */
 L(set96):
        str     q0, [dstin, 16]
-       stp     q0, q0, [dstin, 32]
-       stp     q0, q0, [dstend, -32]
+       stnp    q0, q0, [dstin, 32]
+       stnp    q0, q0, [dstend, -32]
        ret

        .p2align 3
@@ -91,13 +91,13 @@ L(no_zva):
        sub     count, dstend, dst      /* Count is 16 too large.  */
        sub     dst, dst, 16            /* Dst is biased by -32.  */
        sub     count, count, 64 + 16   /* Adjust count and bias for loop.  */
-1:     stp     q0, q0, [dst, 32]
-       stp     q0, q0, [dst, 64]!
+1:     stnp    q0, q0, [dst, 32]
+       stnp    q0, q0, [dst, 64]!
 L(tail64):
        subs    count, count, 64
        b.hi    1b
-2:     stp     q0, q0, [dstend, -64]
-       stp     q0, q0, [dstend, -32]
+2:     stnp    q0, q0, [dstend, -64]
+       stnp    q0, q0, [dstend, -32]
        ret

 L(try_zva):
@@ -116,10 +116,10 @@ L(try_zva):
         */
 L(zva_64):
        str     q0, [dst, 16]
-       stp     q0, q0, [dst, 32]
+       stnp    q0, q0, [dst, 32]
        bic     dst, dst, 63
-       stp     q0, q0, [dst, 64]
-       stp     q0, q0, [dst, 96]
+       stnp    q0, q0, [dst, 64]
+       stnp    q0, q0, [dst, 96]
        sub     count, dstend, dst      /* Count is now 128 too large.  */
        sub     count, count, 128+64+64 /* Adjust count and bias for loop.  */
        add     dst, dst, 128
@@ -128,10 +128,10 @@ L(zva_64):
        add     dst, dst, 64
        subs    count, count, 64
        b.hi    1b
-       stp     q0, q0, [dst, 0]
-       stp     q0, q0, [dst, 32]
-       stp     q0, q0, [dstend, -64]
-       stp     q0, q0, [dstend, -32]
+       stnp    q0, q0, [dst, 0]
+       stnp    q0, q0, [dst, 32]
+       stnp    q0, q0, [dstend, -64]
+       stnp    q0, q0, [dstend, -32]
        ret

        .p2align 3
@@ -140,9 +140,9 @@ L(zva_128):
        b.ne    L(zva_other)

        str     q0, [dst, 16]
-       stp     q0, q0, [dst, 32]
-       stp     q0, q0, [dst, 64]
-       stp     q0, q0, [dst, 96]
+       stnp    q0, q0, [dst, 32]
+       stnp    q0, q0, [dst, 64]
+       stnp    q0, q0, [dst, 96]
        bic     dst, dst, 127
        sub     count, dstend, dst      /* Count is now 128 too large.  */
        sub     count, count, 128+128   /* Adjust count and bias for loop.  */
@@ -151,10 +151,10 @@ L(zva_128):
        add     dst, dst, 128
        subs    count, count, 128
        b.hi    1b
-       stp     q0, q0, [dstend, -128]
-       stp     q0, q0, [dstend, -96]
-       stp     q0, q0, [dstend, -64]
-       stp     q0, q0, [dstend, -32]
+       stnp    q0, q0, [dstend, -128]
+       stnp    q0, q0, [dstend, -96]
+       stnp    q0, q0, [dstend, -64]
+       stnp    q0, q0, [dstend, -32]
        ret

 L(zva_other):
@@ -170,8 +170,8 @@ L(zva_other):
        subs    count, tmp1, dst        /* Actual alignment bytes to write.  */
        bic     tmp1, tmp1, tmp2        /* Aligned dc zva start address.  */
        beq     2f
-1:     stp     q0, q0, [dst], 64
-       stp     q0, q0, [dst, -32]
+1:     stnp    q0, q0, [dst], 64
+       stnp    q0, q0, [dst, -32]
        subs    count, count, 64
        b.hi    1b
 2:     mov     dst, tmp1
diff --git a/sysdeps/nptl/lowlevellock.h b/sysdeps/nptl/lowlevellock.h
index be60c9a..ad3e487 100644
--- a/sysdeps/nptl/lowlevellock.h
+++ b/sysdeps/nptl/lowlevellock.h
@@ -76,9 +76,9 @@
 #define lll_cond_trylock(lock) \
   __glibc_unlikely (atomic_compare_and_exchange_bool_acq (&(lock), 2, 0))

-extern void __lll_lock_wait_private (int *futex);
+extern inline void __lll_lock_wait_private (int *futex) __attribute__((always_inline));
 libc_hidden_proto (__lll_lock_wait_private)
-extern void __lll_lock_wait (int *futex, int private);
+extern inline void __lll_lock_wait (int *futex, int private) __attribute__((always_inline));
 libc_hidden_proto (__lll_lock_wait)

 /* This is an expression rather than a statement even though its value is


* [Bug string/28216] interleaving load/store instructions in memcpy for aarch64
  2021-08-10  4:28 [Bug string/28216] New: interleaving load/store instructions in memcpy for aarch64 wangxuszcn at foxmail dot com
@ 2021-08-10  8:03 ` wangxuszcn at foxmail dot com
  2022-04-21 14:54 ` wdijkstr at arm dot com
  1 sibling, 0 replies; 3+ messages in thread
From: wangxuszcn at foxmail dot com @ 2021-08-10  8:03 UTC (permalink / raw)
  To: glibc-bugs

https://sourceware.org/bugzilla/show_bug.cgi?id=28216

--- Comment #1 from wangxu <wangxuszcn at foxmail dot com> ---
Created attachment 13610
  --> https://sourceware.org/bugzilla/attachment.cgi?id=13610&action=edit
memcpy_interleaving_load_store_instructions


* [Bug string/28216] interleaving load/store instructions in memcpy for aarch64
  2021-08-10  4:28 [Bug string/28216] New: interleaving load/store instructions in memcpy for aarch64 wangxuszcn at foxmail dot com
  2021-08-10  8:03 ` [Bug string/28216] " wangxuszcn at foxmail dot com
@ 2022-04-21 14:54 ` wdijkstr at arm dot com
  1 sibling, 0 replies; 3+ messages in thread
From: wdijkstr at arm dot com @ 2022-04-21 14:54 UTC (permalink / raw)
  To: glibc-bugs

https://sourceware.org/bugzilla/show_bug.cgi?id=28216

Wilco <wdijkstr at arm dot com> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |wdijkstr at arm dot com

--- Comment #2 from Wilco <wdijkstr at arm dot com> ---
Memcpy shares code with memmove for sizes up to 128 bytes, so it is essential
to do all loads before any store. Fine-grained interleaving of loads and
stores isn't needed on out-of-order cores - all you want to avoid is extremes
like doing 32 loads followed by 32 stores.
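
To make the overlap hazard concrete, here is a minimal C sketch (an
illustration only, not glibc code) that models the copy16 path on an
overlapping 12-byte memmove with dst = src + 4. Loading both halves before
storing anything gives the correct result; storing the first half before
loading the second, as the proposed reordering does, reads source bytes that
the first store has already overwritten:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Load A (first 8 bytes) and B (last 8 bytes), then store both:
   the current all-loads-before-any-store ordering.  */
static void move12_loads_first (char *dst, const char *src)
{
  uint64_t a, b;
  memcpy (&a, src, 8);           /* A = src[0..7]  */
  memcpy (&b, src + 12 - 8, 8);  /* B = src[4..11] */
  memcpy (dst, &a, 8);
  memcpy (dst + 12 - 8, &b, 8);
}

/* Store A before loading B: the ordering proposed in the patch.  */
static void move12_interleaved (char *dst, const char *src)
{
  uint64_t a, b;
  memcpy (&a, src, 8);
  memcpy (dst, &a, 8);           /* clobbers src[4..11] when the regions overlap */
  memcpy (&b, src + 12 - 8, 8);  /* B now holds copied data, not the original    */
  memcpy (dst + 12 - 8, &b, 8);
}

int main (void)
{
  char buf1[16], buf2[16];
  for (int i = 0; i < 16; i++)
    buf1[i] = buf2[i] = (char) ('a' + i);

  move12_loads_first (buf1 + 4, buf1);   /* overlapping move, dst = src + 4 */
  move12_interleaved (buf2 + 4, buf2);

  printf ("loads first: %.12s\n", buf1 + 4);  /* abcdefghijkl (correct)   */
  printf ("interleaved: %.12s\n", buf2 + 4);  /* abcdabcdefgh (corrupted) */
  return 0;
}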

