public inbox for glibc-bugs@sourceware.org help / color / mirror / Atom feed
From: "wangxuszcn at foxmail dot com" <sourceware-bugzilla@sourceware.org> To: glibc-bugs@sourceware.org Subject: [Bug string/28216] New: interleaving load/store instructions in memcpy for aarch64 Date: Tue, 10 Aug 2021 04:28:46 +0000 [thread overview] Message-ID: <bug-28216-131@http.sourceware.org/bugzilla/> (raw) https://sourceware.org/bugzilla/show_bug.cgi?id=28216 Bug ID: 28216 Summary: interleaving load/store instructions in memcpy for aarch64 Product: glibc Version: unspecified Status: UNCONFIRMED Severity: enhancement Priority: P2 Component: string Assignee: unassigned at sourceware dot org Reporter: wangxuszcn at foxmail dot com Target Milestone: --- Created attachment 13609 --> https://sourceware.org/bugzilla/attachment.cgi?id=13609&action=edit memcpy_interleaving_load_store_instructions interleaving load and store instructions in memcpy for aarch64 Background Knowledge: The Cortex-A57 processor includes separate load and store pipelines, which allow it to execute one load µop and one store µop every cycle. To achieve maximum throughput for memory copy (or similar loops), one should do the following. • Unroll the loop to include multiple load and store operations per iteration, minimizing the overheads of looping. • Use discrete, non-writeback forms of load and store instructions (such as LDRD and STRD), interleaving them so that one load and one store operation may be performed each cycle. Avoid load-/store-multiple instruction encodings (such as LDM and STM), which lead to separated bursts of load and store µops which may not allow concurrent utilization of both the load and store pipelines. 
The following example shows a recommended instruction sequence for a long memory copy in AArch32 state: Loop_start: SUBS r2,r2,#64 LDRD r3,r4,[r1,#0] STRD r3,r4,[r0,#0] LDRD r3,r4,[r1,#8] STRD r3,r4,[r0,#8] LDRD r3,r4,[r1,#16] STRD r3,r4,[r0,#16] LDRD r3,r4,[r1,#24] STRD r3,r4,[r0,#24] LDRD r3,r4,[r1,#32] STRD r3,r4,[r0,#32] LDRD r3,r4,[r1,#40] STRD r3,r4,[r0,#40] LDRD r3,r4,[r1,#48] STRD r3,r4,[r0,#48] LDRD r3,r4,[r1,#56] STRD r3,r4,[r0,#56] ADD r1,r1,#64 ADD r0,r0,#64 BGT Loop_start A recommended copy routine for AArch64 would look similar to the sequence above, but would use LDP/STP instructions. patch: diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S index 0adc524..0ea4e3e 100644 --- a/sysdeps/aarch64/memcpy.S +++ b/sysdeps/aarch64/memcpy.S @@ -88,8 +88,8 @@ ENTRY_ALIGN (MEMCPY, 6) cmp count, 16 b.lo L(copy16) ldp A_l, A_h, [src] - ldp D_l, D_h, [srcend, -16] stp A_l, A_h, [dstin] + ldp D_l, D_h, [srcend, -16] stp D_l, D_h, [dstend, -16] ret @@ -97,8 +97,8 @@ ENTRY_ALIGN (MEMCPY, 6) L(copy16): tbz count, 3, L(copy8) ldr A_l, [src] - ldr A_h, [srcend, -8] str A_l, [dstin] + ldr A_h, [srcend, -8] str A_h, [dstend, -8] ret @@ -107,8 +107,8 @@ L(copy16): L(copy8): tbz count, 2, L(copy4) ldr A_lw, [src] - ldr B_lw, [srcend, -4] str A_lw, [dstin] + ldr B_lw, [srcend, -4] str B_lw, [dstend, -4] ret @@ -117,11 +117,11 @@ L(copy4): cbz count, L(copy0) lsr tmp1, count, 1 ldrb A_lw, [src] + strb A_lw, [dstin] ldrb C_lw, [srcend, -1] + strb C_lw, [dstend, -1] ldrb B_lw, [src, tmp1] - strb A_lw, [dstin] strb B_lw, [dstin, tmp1] - strb C_lw, [dstend, -1] L(copy0): ret @@ -148,8 +148,8 @@ L(copy128): cmp count, 96 b.ls L(copy96) ldp G_l, G_h, [srcend, -64] - ldp H_l, H_h, [srcend, -48] stp G_l, G_h, [dstend, -64] + ldp H_l, H_h, [srcend, -48] stp H_l, H_h, [dstend, -48] L(copy96): stp A_l, A_h, [dstin] @@ -224,8 +224,8 @@ ENTRY_ALIGN (MEMMOVE, 4) cmp count, 16 b.lo L(copy16) ldp A_l, A_h, [src] - ldp D_l, D_h, [srcend, -16] stp A_l, A_h, [dstin] + ldp D_l, 
D_h, [srcend, -16] stp D_l, D_h, [dstend, -16] ret diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S index 9067ea2..83bae2f 100644 --- a/sysdeps/aarch64/memset.S +++ b/sysdeps/aarch64/memset.S @@ -74,8 +74,8 @@ L(set_medium): 32 bytes from the end. */ L(set96): str q0, [dstin, 16] - stp q0, q0, [dstin, 32] - stp q0, q0, [dstend, -32] + stnp q0, q0, [dstin, 32] + stnp q0, q0, [dstend, -32] ret .p2align 3 @@ -91,13 +91,13 @@ L(no_zva): sub count, dstend, dst /* Count is 16 too large. */ sub dst, dst, 16 /* Dst is biased by -32. */ sub count, count, 64 + 16 /* Adjust count and bias for loop. */ -1: stp q0, q0, [dst, 32] - stp q0, q0, [dst, 64]! +1: stnp q0, q0, [dst, 32] + stnp q0, q0, [dst, 64]! L(tail64): subs count, count, 64 b.hi 1b -2: stp q0, q0, [dstend, -64] - stp q0, q0, [dstend, -32] +2: stnp q0, q0, [dstend, -64] + stnp q0, q0, [dstend, -32] ret L(try_zva): @@ -116,10 +116,10 @@ L(try_zva): */ L(zva_64): str q0, [dst, 16] - stp q0, q0, [dst, 32] + stnp q0, q0, [dst, 32] bic dst, dst, 63 - stp q0, q0, [dst, 64] - stp q0, q0, [dst, 96] + stnp q0, q0, [dst, 64] + stnp q0, q0, [dst, 96] sub count, dstend, dst /* Count is now 128 too large. */ sub count, count, 128+64+64 /* Adjust count and bias for loop. */ add dst, dst, 128 @@ -128,10 +128,10 @@ L(zva_64): add dst, dst, 64 subs count, count, 64 b.hi 1b - stp q0, q0, [dst, 0] - stp q0, q0, [dst, 32] - stp q0, q0, [dstend, -64] - stp q0, q0, [dstend, -32] + stnp q0, q0, [dst, 0] + stnp q0, q0, [dst, 32] + stnp q0, q0, [dstend, -64] + stnp q0, q0, [dstend, -32] ret .p2align 3 @@ -140,9 +140,9 @@ L(zva_128): b.ne L(zva_other) str q0, [dst, 16] - stp q0, q0, [dst, 32] - stp q0, q0, [dst, 64] - stp q0, q0, [dst, 96] + stnp q0, q0, [dst, 32] + stnp q0, q0, [dst, 64] + stnp q0, q0, [dst, 96] bic dst, dst, 127 sub count, dstend, dst /* Count is now 128 too large. */ sub count, count, 128+128 /* Adjust count and bias for loop. 
*/ @@ -151,10 +151,10 @@ L(zva_128): add dst, dst, 128 subs count, count, 128 b.hi 1b - stp q0, q0, [dstend, -128] - stp q0, q0, [dstend, -96] - stp q0, q0, [dstend, -64] - stp q0, q0, [dstend, -32] + stnp q0, q0, [dstend, -128] + stnp q0, q0, [dstend, -96] + stnp q0, q0, [dstend, -64] + stnp q0, q0, [dstend, -32] ret L(zva_other): @@ -170,8 +170,8 @@ L(zva_other): subs count, tmp1, dst /* Actual alignment bytes to write. */ bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. */ beq 2f -1: stp q0, q0, [dst], 64 - stp q0, q0, [dst, -32] +1: stnp q0, q0, [dst], 64 + stnp q0, q0, [dst, -32] subs count, count, 64 b.hi 1b 2: mov dst, tmp1 diff --git a/sysdeps/nptl/lowlevellock.h b/sysdeps/nptl/lowlevellock.h index be60c9a..ad3e487 100644 --- a/sysdeps/nptl/lowlevellock.h +++ b/sysdeps/nptl/lowlevellock.h @@ -76,9 +76,9 @@ #define lll_cond_trylock(lock) \ __glibc_unlikely (atomic_compare_and_exchange_bool_acq (&(lock), 2, 0)) -extern void __lll_lock_wait_private (int *futex); +extern inline void __lll_lock_wait_private (int *futex) __attribute__((always_inline)); libc_hidden_proto (__lll_lock_wait_private) -extern void __lll_lock_wait (int *futex, int private); +extern inline void __lll_lock_wait (int *futex, int private) __attribute__((always_inline)); libc_hidden_proto (__lll_lock_wait) /* This is an expression rather than a statement even though its value is -- You are receiving this mail because: You are on the CC list for the bug.
next reply other threads:[~2021-08-10 4:28 UTC|newest] Thread overview: 3+ messages / expand[flat|nested] mbox.gz Atom feed top 2021-08-10 4:28 wangxuszcn at foxmail dot com [this message] 2021-08-10 8:03 ` [Bug string/28216] " wangxuszcn at foxmail dot com 2022-04-21 14:54 ` wdijkstr at arm dot com
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=bug-28216-131@http.sourceware.org/bugzilla/ \ --to=sourceware-bugzilla@sourceware.org \ --cc=glibc-bugs@sourceware.org \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link. Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).