public inbox for glibc-bugs@sourceware.org help / color / mirror / Atom feed
From: "wangxuszcn at foxmail dot com" <sourceware-bugzilla@sourceware.org> To: glibc-bugs@sourceware.org Subject: [Bug string/28216] New: interleaving load/store instructions in memcpy for aarch64 Date: Tue, 10 Aug 2021 04:28:46 +0000 [thread overview] Message-ID: <bug-28216-131@http.sourceware.org/bugzilla/> (raw) https://sourceware.org/bugzilla/show_bug.cgi?id=28216 Bug ID: 28216 Summary: interleaving load/store instructions in memcpy for aarch64 Product: glibc Version: unspecified Status: UNCONFIRMED Severity: enhancement Priority: P2 Component: string Assignee: unassigned at sourceware dot org Reporter: wangxuszcn at foxmail dot com Target Milestone: --- Created attachment 13609 --> https://sourceware.org/bugzilla/attachment.cgi?id=13609&action=edit memcpy_interleaving_load_store_instructions interleaving load and store instructions in memcpy for aarch64 Background Knowledge: The Cortex-A57 processor includes separate load and store pipelines, which allow it to execute one load µop and one store µop every cycle. To achieve maximum throughput for memory copy (or similar loops), one should do the following. • Unroll the loop to include multiple load and store operations per iteration, minimizing the overheads of looping. • Use discrete, non-writeback forms of load and store instructions (such as LDRD and STRD), interleaving them so that one load and one store operation may be performed each cycle. Avoid load-/store-multiple instruction encodings (such as LDM and STM), which lead to separated bursts of load and store µops which may not allow concurrent utilization of both the load and store pipelines. 
The following example shows a recommended instruction sequence for a long memory copy in AArch32 state: Loop_start: SUBS r2,r2,#64 LDRD r3,r4,[r1,#0] STRD r3,r4,[r0,#0] LDRD r3,r4,[r1,#8] STRD r3,r4,[r0,#8] LDRD r3,r4,[r1,#16] STRD r3,r4,[r0,#16] LDRD r3,r4,[r1,#24] STRD r3,r4,[r0,#24] LDRD r3,r4,[r1,#32] STRD r3,r4,[r0,#32] LDRD r3,r4,[r1,#40] STRD r3,r4,[r0,#40] LDRD r3,r4,[r1,#48] STRD r3,r4,[r0,#48] LDRD r3,r4,[r1,#56] STRD r3,r4,[r0,#56] ADD r1,r1,#64 ADD r0,r0,#64 BGT Loop_start A recommended copy routine for AArch64 would look similar to the sequence above, but would use LDP/STP instructions. patch: diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S index 0adc524..0ea4e3e 100644 --- a/sysdeps/aarch64/memcpy.S +++ b/sysdeps/aarch64/memcpy.S @@ -88,8 +88,8 @@ ENTRY_ALIGN (MEMCPY, 6) cmp count, 16 b.lo L(copy16) ldp A_l, A_h, [src] - ldp D_l, D_h, [srcend, -16] stp A_l, A_h, [dstin] + ldp D_l, D_h, [srcend, -16] stp D_l, D_h, [dstend, -16] ret @@ -97,8 +97,8 @@ ENTRY_ALIGN (MEMCPY, 6) L(copy16): tbz count, 3, L(copy8) ldr A_l, [src] - ldr A_h, [srcend, -8] str A_l, [dstin] + ldr A_h, [srcend, -8] str A_h, [dstend, -8] ret @@ -107,8 +107,8 @@ L(copy16): L(copy8): tbz count, 2, L(copy4) ldr A_lw, [src] - ldr B_lw, [srcend, -4] str A_lw, [dstin] + ldr B_lw, [srcend, -4] str B_lw, [dstend, -4] ret @@ -117,11 +117,11 @@ L(copy4): cbz count, L(copy0) lsr tmp1, count, 1 ldrb A_lw, [src] + strb A_lw, [dstin] ldrb C_lw, [srcend, -1] + strb C_lw, [dstend, -1] ldrb B_lw, [src, tmp1] - strb A_lw, [dstin] strb B_lw, [dstin, tmp1] - strb C_lw, [dstend, -1] L(copy0): ret @@ -148,8 +148,8 @@ L(copy128): cmp count, 96 b.ls L(copy96) ldp G_l, G_h, [srcend, -64] - ldp H_l, H_h, [srcend, -48] stp G_l, G_h, [dstend, -64] + ldp H_l, H_h, [srcend, -48] stp H_l, H_h, [dstend, -48] L(copy96): stp A_l, A_h, [dstin] @@ -224,8 +224,8 @@ ENTRY_ALIGN (MEMMOVE, 4) cmp count, 16 b.lo L(copy16) ldp A_l, A_h, [src] - ldp D_l, D_h, [srcend, -16] stp A_l, A_h, [dstin] + ldp D_l, 
D_h, [srcend, -16] stp D_l, D_h, [dstend, -16] ret diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S index 9067ea2..83bae2f 100644 --- a/sysdeps/aarch64/memset.S +++ b/sysdeps/aarch64/memset.S @@ -74,8 +74,8 @@ L(set_medium): 32 bytes from the end. */ L(set96): str q0, [dstin, 16] - stp q0, q0, [dstin, 32] - stp q0, q0, [dstend, -32] + stnp q0, q0, [dstin, 32] + stnp q0, q0, [dstend, -32] ret .p2align 3 @@ -91,13 +91,13 @@ L(no_zva): sub count, dstend, dst /* Count is 16 too large. */ sub dst, dst, 16 /* Dst is biased by -32. */ sub count, count, 64 + 16 /* Adjust count and bias for loop. */ -1: stp q0, q0, [dst, 32] - stp q0, q0, [dst, 64]! +1: stnp q0, q0, [dst, 32] + stnp q0, q0, [dst, 64]! L(tail64): subs count, count, 64 b.hi 1b -2: stp q0, q0, [dstend, -64] - stp q0, q0, [dstend, -32] +2: stnp q0, q0, [dstend, -64] + stnp q0, q0, [dstend, -32] ret L(try_zva): @@ -116,10 +116,10 @@ L(try_zva): */ L(zva_64): str q0, [dst, 16] - stp q0, q0, [dst, 32] + stnp q0, q0, [dst, 32] bic dst, dst, 63 - stp q0, q0, [dst, 64] - stp q0, q0, [dst, 96] + stnp q0, q0, [dst, 64] + stnp q0, q0, [dst, 96] sub count, dstend, dst /* Count is now 128 too large. */ sub count, count, 128+64+64 /* Adjust count and bias for loop. */ add dst, dst, 128 @@ -128,10 +128,10 @@ L(zva_64): add dst, dst, 64 subs count, count, 64 b.hi 1b - stp q0, q0, [dst, 0] - stp q0, q0, [dst, 32] - stp q0, q0, [dstend, -64] - stp q0, q0, [dstend, -32] + stnp q0, q0, [dst, 0] + stnp q0, q0, [dst, 32] + stnp q0, q0, [dstend, -64] + stnp q0, q0, [dstend, -32] ret .p2align 3 @@ -140,9 +140,9 @@ L(zva_128): b.ne L(zva_other) str q0, [dst, 16] - stp q0, q0, [dst, 32] - stp q0, q0, [dst, 64] - stp q0, q0, [dst, 96] + stnp q0, q0, [dst, 32] + stnp q0, q0, [dst, 64] + stnp q0, q0, [dst, 96] bic dst, dst, 127 sub count, dstend, dst /* Count is now 128 too large. */ sub count, count, 128+128 /* Adjust count and bias for loop. 
*/ @@ -151,10 +151,10 @@ L(zva_128): add dst, dst, 128 subs count, count, 128 b.hi 1b - stp q0, q0, [dstend, -128] - stp q0, q0, [dstend, -96] - stp q0, q0, [dstend, -64] - stp q0, q0, [dstend, -32] + stnp q0, q0, [dstend, -128] + stnp q0, q0, [dstend, -96] + stnp q0, q0, [dstend, -64] + stnp q0, q0, [dstend, -32] ret L(zva_other): @@ -170,8 +170,8 @@ L(zva_other): subs count, tmp1, dst /* Actual alignment bytes to write. */ bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. */ beq 2f -1: stp q0, q0, [dst], 64 - stp q0, q0, [dst, -32] +1: stnp q0, q0, [dst], 64 + stnp q0, q0, [dst, -32] subs count, count, 64 b.hi 1b 2: mov dst, tmp1 diff --git a/sysdeps/nptl/lowlevellock.h b/sysdeps/nptl/lowlevellock.h index be60c9a..ad3e487 100644 --- a/sysdeps/nptl/lowlevellock.h +++ b/sysdeps/nptl/lowlevellock.h @@ -76,9 +76,9 @@ #define lll_cond_trylock(lock) \ __glibc_unlikely (atomic_compare_and_exchange_bool_acq (&(lock), 2, 0)) -extern void __lll_lock_wait_private (int *futex); +extern inline void __lll_lock_wait_private (int *futex) __attribute__((always_inline)); libc_hidden_proto (__lll_lock_wait_private) -extern void __lll_lock_wait (int *futex, int private); +extern inline void __lll_lock_wait (int *futex, int private) __attribute__((always_inline)); libc_hidden_proto (__lll_lock_wait) /* This is an expression rather than a statement even though its value is -- You are receiving this mail because: You are on the CC list for the bug.
next reply other threads:[~2021-08-10 4:28 UTC|newest] Thread overview: 3+ messages / expand[flat|nested] mbox.gz Atom feed top 2021-08-10 4:28 wangxuszcn at foxmail dot com [this message] 2021-08-10 8:03 ` [Bug string/28216] " wangxuszcn at foxmail dot com 2022-04-21 14:54 ` wdijkstr at arm dot com
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=bug-28216-131@http.sourceware.org/bugzilla/ \ --to=sourceware-bugzilla@sourceware.org \ --cc=glibc-bugs@sourceware.org \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link. Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).