From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 48) id E8A3539450CE; Tue, 10 Aug 2021 04:28:46 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org E8A3539450CE From: "wangxuszcn at foxmail dot com" To: glibc-bugs@sourceware.org Subject: [Bug string/28216] New: interleaving load/store instructions in memcpy for aarch64 Date: Tue, 10 Aug 2021 04:28:46 +0000 X-Bugzilla-Reason: CC X-Bugzilla-Type: new X-Bugzilla-Watch-Reason: None X-Bugzilla-Product: glibc X-Bugzilla-Component: string X-Bugzilla-Version: unspecified X-Bugzilla-Keywords: X-Bugzilla-Severity: enhancement X-Bugzilla-Who: wangxuszcn at foxmail dot com X-Bugzilla-Status: UNCONFIRMED X-Bugzilla-Resolution: X-Bugzilla-Priority: P2 X-Bugzilla-Assigned-To: unassigned at sourceware dot org X-Bugzilla-Target-Milestone: --- X-Bugzilla-Flags: X-Bugzilla-Changed-Fields: bug_id short_desc product version bug_status bug_severity priority component assigned_to reporter target_milestone attachments.created Message-ID: Content-Type: text/plain; charset="UTF-8" Content-Transfer-Encoding: quoted-printable X-Bugzilla-URL: http://sourceware.org/bugzilla/ Auto-Submitted: auto-generated MIME-Version: 1.0 X-BeenThere: glibc-bugs@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Glibc-bugs mailing list List-Unsubscribe: , List-Archive: List-Help: List-Subscribe: , X-List-Received-Date: Tue, 10 Aug 2021 04:28:47 -0000 https://sourceware.org/bugzilla/show_bug.cgi?id=3D28216 Bug ID: 28216 Summary: interleaving load/store instructions in memcpy for aarch64 Product: glibc Version: unspecified Status: UNCONFIRMED Severity: enhancement Priority: P2 Component: string Assignee: unassigned at sourceware dot org Reporter: wangxuszcn at foxmail dot com Target Milestone: --- Created attachment 13609 --> https://sourceware.org/bugzilla/attachment.cgi?id=3D13609&action=3Ded= it memcpy_interleaving_load_store_instructions interleaving load and store instructions 
in memcpy for aarch64 Background Knowledge: The Cortex-A57 processor includes separate load and store pipelines, which allow it to execute one load =C2=B5op and one store =C2=B5op every cycle. To achieve maximum throughput for memory copy (or similar loops), one sho= uld do the following. =E2=80=A2 Unroll the loop to include multiple load and store operations per= iteration, minimizing the overheads of looping. =E2=80=A2 Use discrete, non-writeback forms of load and store instructions = (such as LDRD and STRD), interleaving them so that one load and one store operation = may be performed each cycle. Avoid load-/store-multiple instruction encodings (= such as LDM and STM), which lead to separated bursts of load and store =C2=B5ops= which may not allow concurrent utilization of both the load and store pipelines. The following example shows a recommended instruction sequence for a long memory copy in AArch32 state: Loop_start: SUBS r2,r2,#64 LDRD r3,r4,[r1,#0] STRD r3,r4,[r0,#0] LDRD r3,r4,[r1,#8] STRD r3,r4,[r0,#8] LDRD r3,r4,[r1,#16] STRD r3,r4,[r0,#16] LDRD r3,r4,[r1,#24] STRD r3,r4,[r0,#24] LDRD r3,r4,[r1,#32] STRD r3,r4,[r0,#32] LDRD r3,r4,[r1,#40] STRD r3,r4,[r0,#40] LDRD r3,r4,[r1,#48] STRD r3,r4,[r0,#48] LDRD r3,r4,[r1,#56] STRD r3,r4,[r0,#56] ADD r1,r1,#64 ADD r0,r0,#64 BGT Loop_start A recommended copy routine for AArch64 would look similar to the sequence above, but would use LDP/STP instructions. 
patch: diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S index 0adc524..0ea4e3e 100644 --- a/sysdeps/aarch64/memcpy.S +++ b/sysdeps/aarch64/memcpy.S @@ -88,8 +88,8 @@ ENTRY_ALIGN (MEMCPY, 6) cmp count, 16 b.lo L(copy16) ldp A_l, A_h, [src] - ldp D_l, D_h, [srcend, -16] stp A_l, A_h, [dstin] + ldp D_l, D_h, [srcend, -16] stp D_l, D_h, [dstend, -16] ret @@ -97,8 +97,8 @@ ENTRY_ALIGN (MEMCPY, 6) L(copy16): tbz count, 3, L(copy8) ldr A_l, [src] - ldr A_h, [srcend, -8] str A_l, [dstin] + ldr A_h, [srcend, -8] str A_h, [dstend, -8] ret @@ -107,8 +107,8 @@ L(copy16): L(copy8): tbz count, 2, L(copy4) ldr A_lw, [src] - ldr B_lw, [srcend, -4] str A_lw, [dstin] + ldr B_lw, [srcend, -4] str B_lw, [dstend, -4] ret @@ -117,11 +117,11 @@ L(copy4): cbz count, L(copy0) lsr tmp1, count, 1 ldrb A_lw, [src] + strb A_lw, [dstin] ldrb C_lw, [srcend, -1] + strb C_lw, [dstend, -1] ldrb B_lw, [src, tmp1] - strb A_lw, [dstin] strb B_lw, [dstin, tmp1] - strb C_lw, [dstend, -1] L(copy0): ret @@ -148,8 +148,8 @@ L(copy128): cmp count, 96 b.ls L(copy96) ldp G_l, G_h, [srcend, -64] - ldp H_l, H_h, [srcend, -48] stp G_l, G_h, [dstend, -64] + ldp H_l, H_h, [srcend, -48] stp H_l, H_h, [dstend, -48] L(copy96): stp A_l, A_h, [dstin] @@ -224,8 +224,8 @@ ENTRY_ALIGN (MEMMOVE, 4) cmp count, 16 b.lo L(copy16) ldp A_l, A_h, [src] - ldp D_l, D_h, [srcend, -16] stp A_l, A_h, [dstin] + ldp D_l, D_h, [srcend, -16] stp D_l, D_h, [dstend, -16] ret diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S index 9067ea2..83bae2f 100644 --- a/sysdeps/aarch64/memset.S +++ b/sysdeps/aarch64/memset.S @@ -74,8 +74,8 @@ L(set_medium): 32 bytes from the end. */ L(set96): str q0, [dstin, 16] - stp q0, q0, [dstin, 32] - stp q0, q0, [dstend, -32] + stnp q0, q0, [dstin, 32] + stnp q0, q0, [dstend, -32] ret .p2align 3 @@ -91,13 +91,13 @@ L(no_zva): sub count, dstend, dst /* Count is 16 too large. */ sub dst, dst, 16 /* Dst is biased by -32. */ sub count, count, 64 + 16 /* Adjust count and bias for loop. 
= */ -1: stp q0, q0, [dst, 32] - stp q0, q0, [dst, 64]! +1: stnp q0, q0, [dst, 32] + stnp q0, q0, [dst, 64]! L(tail64): subs count, count, 64 b.hi 1b -2: stp q0, q0, [dstend, -64] - stp q0, q0, [dstend, -32] +2: stnp q0, q0, [dstend, -64] + stnp q0, q0, [dstend, -32] ret L(try_zva): @@ -116,10 +116,10 @@ L(try_zva): */ L(zva_64): str q0, [dst, 16] - stp q0, q0, [dst, 32] + stnp q0, q0, [dst, 32] bic dst, dst, 63 - stp q0, q0, [dst, 64] - stp q0, q0, [dst, 96] + stnp q0, q0, [dst, 64] + stnp q0, q0, [dst, 96] sub count, dstend, dst /* Count is now 128 too large. */ sub count, count, 128+64+64 /* Adjust count and bias for loop. = */ add dst, dst, 128 @@ -128,10 +128,10 @@ L(zva_64): add dst, dst, 64 subs count, count, 64 b.hi 1b - stp q0, q0, [dst, 0] - stp q0, q0, [dst, 32] - stp q0, q0, [dstend, -64] - stp q0, q0, [dstend, -32] + stnp q0, q0, [dst, 0] + stnp q0, q0, [dst, 32] + stnp q0, q0, [dstend, -64] + stnp q0, q0, [dstend, -32] ret .p2align 3 @@ -140,9 +140,9 @@ L(zva_128): b.ne L(zva_other) str q0, [dst, 16] - stp q0, q0, [dst, 32] - stp q0, q0, [dst, 64] - stp q0, q0, [dst, 96] + stnp q0, q0, [dst, 32] + stnp q0, q0, [dst, 64] + stnp q0, q0, [dst, 96] bic dst, dst, 127 sub count, dstend, dst /* Count is now 128 too large. */ sub count, count, 128+128 /* Adjust count and bias for loop. = */ @@ -151,10 +151,10 @@ L(zva_128): add dst, dst, 128 subs count, count, 128 b.hi 1b - stp q0, q0, [dstend, -128] - stp q0, q0, [dstend, -96] - stp q0, q0, [dstend, -64] - stp q0, q0, [dstend, -32] + stnp q0, q0, [dstend, -128] + stnp q0, q0, [dstend, -96] + stnp q0, q0, [dstend, -64] + stnp q0, q0, [dstend, -32] ret L(zva_other): @@ -170,8 +170,8 @@ L(zva_other): subs count, tmp1, dst /* Actual alignment bytes to write.= */ bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. 
*/ beq 2f -1: stp q0, q0, [dst], 64 - stp q0, q0, [dst, -32] +1: stnp q0, q0, [dst], 64 + stnp q0, q0, [dst, -32] subs count, count, 64 b.hi 1b 2: mov dst, tmp1 diff --git a/sysdeps/nptl/lowlevellock.h b/sysdeps/nptl/lowlevellock.h index be60c9a..ad3e487 100644 --- a/sysdeps/nptl/lowlevellock.h +++ b/sysdeps/nptl/lowlevellock.h @@ -76,9 +76,9 @@ #define lll_cond_trylock(lock) \ __glibc_unlikely (atomic_compare_and_exchange_bool_acq (&(lock), 2, 0)) -extern void __lll_lock_wait_private (int *futex); +extern inline void __lll_lock_wait_private (int *futex) __attribute__((always_inline)); libc_hidden_proto (__lll_lock_wait_private) -extern void __lll_lock_wait (int *futex, int private); +extern inline void __lll_lock_wait (int *futex, int private) __attribute__((always_inline)); libc_hidden_proto (__lll_lock_wait) /* This is an expression rather than a statement even though its value is --=20 You are receiving this mail because: You are on the CC list for the bug.=