public inbox for glibc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug string/28214] New: [suggestion] using stnp instead of stp in memset of aarch64
@ 2021-08-10  2:10 wangxuszcn at foxmail dot com
  2022-04-21 14:41 ` [Bug string/28214] " wdijkstr at arm dot com
                   ` (2 more replies)
  0 siblings, 3 replies; 4+ messages in thread
From: wangxuszcn at foxmail dot com @ 2021-08-10  2:10 UTC (permalink / raw)
  To: glibc-bugs

https://sourceware.org/bugzilla/show_bug.cgi?id=28214

            Bug ID: 28214
           Summary: [suggestion] using stnp instead of stp in memset of
                    aarch64
           Product: glibc
           Version: unspecified
            Status: UNCONFIRMED
          Severity: enhancement
          Priority: P2
         Component: string
          Assignee: unassigned at sourceware dot org
          Reporter: wangxuszcn at foxmail dot com
  Target Milestone: ---

Created attachment 13608
  --> https://sourceware.org/bugzilla/attachment.cgi?id=13608&action=edit
memset_change_stp_to_stnp in aarch64

Generally, after memset is called to perform initialization, the
destination address is not used again immediately, so I suggest using stnp
instead of stp in the aarch64 memset.

Background Knowledge:
The Armv8-A architecture provides load/store non-temporal pair instructions
(LDNP/STNP), which give the memory system a hint that an access is
non-temporal or streaming, and unlikely to be repeated in the near future.


diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
index 9067ea2..83bae2f 100644
--- a/sysdeps/aarch64/memset.S
+++ b/sysdeps/aarch64/memset.S
@@ -74,8 +74,8 @@ L(set_medium):
           32 bytes from the end.  */
 L(set96):
        str     q0, [dstin, 16]
-       stp     q0, q0, [dstin, 32]
-       stp     q0, q0, [dstend, -32]
+       stnp    q0, q0, [dstin, 32]
+       stnp    q0, q0, [dstend, -32]
        ret

        .p2align 3
@@ -91,13 +91,13 @@ L(no_zva):
        sub     count, dstend, dst      /* Count is 16 too large.  */
        sub     dst, dst, 16            /* Dst is biased by -32.  */
        sub     count, count, 64 + 16   /* Adjust count and bias for loop.  */
-1:     stp     q0, q0, [dst, 32]
-       stp     q0, q0, [dst, 64]!
+1:     stnp    q0, q0, [dst, 32]
+       stnp    q0, q0, [dst, 64]!
 L(tail64):
        subs    count, count, 64
        b.hi    1b
-2:     stp     q0, q0, [dstend, -64]
-       stp     q0, q0, [dstend, -32]
+2:     stnp    q0, q0, [dstend, -64]
+       stnp    q0, q0, [dstend, -32]
        ret

 L(try_zva):
@@ -116,10 +116,10 @@ L(try_zva):
         */
 L(zva_64):
        str     q0, [dst, 16]
-       stp     q0, q0, [dst, 32]
+       stnp    q0, q0, [dst, 32]
        bic     dst, dst, 63
-       stp     q0, q0, [dst, 64]
-       stp     q0, q0, [dst, 96]
+       stnp    q0, q0, [dst, 64]
+       stnp    q0, q0, [dst, 96]
        sub     count, dstend, dst      /* Count is now 128 too large.  */
        sub     count, count, 128+64+64 /* Adjust count and bias for loop.  */
        add     dst, dst, 128
@@ -128,10 +128,10 @@ L(zva_64):
        add     dst, dst, 64
        subs    count, count, 64
        b.hi    1b
-       stp     q0, q0, [dst, 0]
-       stp     q0, q0, [dst, 32]
-       stp     q0, q0, [dstend, -64]
-       stp     q0, q0, [dstend, -32]
+       stnp    q0, q0, [dst, 0]
+       stnp    q0, q0, [dst, 32]
+       stnp    q0, q0, [dstend, -64]
+       stnp    q0, q0, [dstend, -32]
        ret

        .p2align 3
@@ -140,9 +140,9 @@ L(zva_128):
        b.ne    L(zva_other)

        str     q0, [dst, 16]
-       stp     q0, q0, [dst, 32]
-       stp     q0, q0, [dst, 64]
-       stp     q0, q0, [dst, 96]
+       stnp    q0, q0, [dst, 32]
+       stnp    q0, q0, [dst, 64]
+       stnp    q0, q0, [dst, 96]
        bic     dst, dst, 127
        sub     count, dstend, dst      /* Count is now 128 too large.  */
        sub     count, count, 128+128   /* Adjust count and bias for loop.  */
@@ -151,10 +151,10 @@ L(zva_128):
        add     dst, dst, 128
        subs    count, count, 128
        b.hi    1b
-       stp     q0, q0, [dstend, -128]
-       stp     q0, q0, [dstend, -96]
-       stp     q0, q0, [dstend, -64]
-       stp     q0, q0, [dstend, -32]
+       stnp    q0, q0, [dstend, -128]
+       stnp    q0, q0, [dstend, -96]
+       stnp    q0, q0, [dstend, -64]
+       stnp    q0, q0, [dstend, -32]
        ret

 L(zva_other):
@@ -170,8 +170,8 @@ L(zva_other):
        subs    count, tmp1, dst        /* Actual alignment bytes to write.  */
        bic     tmp1, tmp1, tmp2        /* Aligned dc zva start address.  */
        beq     2f
-1:     stp     q0, q0, [dst], 64
-       stp     q0, q0, [dst, -32]
+1:     stnp    q0, q0, [dst], 64
+       stnp    q0, q0, [dst, -32]
        subs    count, count, 64
        b.hi    1b
 2:     mov     dst, tmp1

-- 
You are receiving this mail because:
You are on the CC list for the bug.

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2024-02-01 20:12 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-08-10  2:10 [Bug string/28214] New: [suggestion] using stnp instead of stp in memset of aarch64 wangxuszcn at foxmail dot com
2022-04-21 14:41 ` [Bug string/28214] " wdijkstr at arm dot com
2022-04-21 14:44 ` wangxuszcn at foxmail dot com
2024-02-01 20:12 ` pinskia at gcc dot gnu.org

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).