public inbox for glibc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug string/28214] New: [suggestion] using stnp instead of stp in memset of aarch64
@ 2021-08-10  2:10 wangxuszcn at foxmail dot com
  2022-04-21 14:41 ` [Bug string/28214] " wdijkstr at arm dot com
                   ` (2 more replies)
  0 siblings, 3 replies; 4+ messages in thread
From: wangxuszcn at foxmail dot com @ 2021-08-10  2:10 UTC (permalink / raw)
  To: glibc-bugs

https://sourceware.org/bugzilla/show_bug.cgi?id=28214

            Bug ID: 28214
           Summary: [suggestion] using stnp instead of stp in memset of
                    aarch64
           Product: glibc
           Version: unspecified
            Status: UNCONFIRMED
          Severity: enhancement
          Priority: P2
         Component: string
          Assignee: unassigned at sourceware dot org
          Reporter: wangxuszcn at foxmail dot com
  Target Milestone: ---

Created attachment 13608
  --> https://sourceware.org/bugzilla/attachment.cgi?id=13608&action=edit
memset_change_stp_to_stnp in aarch64

Generally, after memset is called to perform initialization, the destination
memory is not used immediately, so I suggest using stnp instead of stp in the
aarch64 memset.

Background Knowledge:
The ARM v8-A architecture provides load/store non-temporal pair instructions
(LDNP/STNP) that provide a hint to the memory system that an access is
non-temporal or streaming, and unlikely to be repeated in the near future.


diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
index 9067ea2..83bae2f 100644
--- a/sysdeps/aarch64/memset.S
+++ b/sysdeps/aarch64/memset.S
@@ -74,8 +74,8 @@ L(set_medium):
           32 bytes from the end.  */
 L(set96):
        str     q0, [dstin, 16]
-       stp     q0, q0, [dstin, 32]
-       stp     q0, q0, [dstend, -32]
+       stnp    q0, q0, [dstin, 32]
+       stnp    q0, q0, [dstend, -32]
        ret

        .p2align 3
@@ -91,13 +91,13 @@ L(no_zva):
        sub     count, dstend, dst      /* Count is 16 too large.  */
        sub     dst, dst, 16            /* Dst is biased by -32.  */
        sub     count, count, 64 + 16   /* Adjust count and bias for loop.  */
-1:     stp     q0, q0, [dst, 32]
-       stp     q0, q0, [dst, 64]!
+1:     stnp    q0, q0, [dst, 32]
+       stnp    q0, q0, [dst, 64]!
 L(tail64):
        subs    count, count, 64
        b.hi    1b
-2:     stp     q0, q0, [dstend, -64]
-       stp     q0, q0, [dstend, -32]
+2:     stnp    q0, q0, [dstend, -64]
+       stnp    q0, q0, [dstend, -32]
        ret

 L(try_zva):
@@ -116,10 +116,10 @@ L(try_zva):
         */
 L(zva_64):
        str     q0, [dst, 16]
-       stp     q0, q0, [dst, 32]
+       stnp    q0, q0, [dst, 32]
        bic     dst, dst, 63
-       stp     q0, q0, [dst, 64]
-       stp     q0, q0, [dst, 96]
+       stnp    q0, q0, [dst, 64]
+       stnp    q0, q0, [dst, 96]
        sub     count, dstend, dst      /* Count is now 128 too large.  */
        sub     count, count, 128+64+64 /* Adjust count and bias for loop.  */
        add     dst, dst, 128
@@ -128,10 +128,10 @@ L(zva_64):
        add     dst, dst, 64
        subs    count, count, 64
        b.hi    1b
-       stp     q0, q0, [dst, 0]
-       stp     q0, q0, [dst, 32]
-       stp     q0, q0, [dstend, -64]
-       stp     q0, q0, [dstend, -32]
+       stnp    q0, q0, [dst, 0]
+       stnp    q0, q0, [dst, 32]
+       stnp    q0, q0, [dstend, -64]
+       stnp    q0, q0, [dstend, -32]
        ret

        .p2align 3
@@ -140,9 +140,9 @@ L(zva_128):
        b.ne    L(zva_other)

        str     q0, [dst, 16]
-       stp     q0, q0, [dst, 32]
-       stp     q0, q0, [dst, 64]
-       stp     q0, q0, [dst, 96]
+       stnp    q0, q0, [dst, 32]
+       stnp    q0, q0, [dst, 64]
+       stnp    q0, q0, [dst, 96]
        bic     dst, dst, 127
        sub     count, dstend, dst      /* Count is now 128 too large.  */
        sub     count, count, 128+128   /* Adjust count and bias for loop.  */
@@ -151,10 +151,10 @@ L(zva_128):
        add     dst, dst, 128
        subs    count, count, 128
        b.hi    1b
-       stp     q0, q0, [dstend, -128]
-       stp     q0, q0, [dstend, -96]
-       stp     q0, q0, [dstend, -64]
-       stp     q0, q0, [dstend, -32]
+       stnp    q0, q0, [dstend, -128]
+       stnp    q0, q0, [dstend, -96]
+       stnp    q0, q0, [dstend, -64]
+       stnp    q0, q0, [dstend, -32]
        ret

 L(zva_other):
@@ -170,8 +170,8 @@ L(zva_other):
        subs    count, tmp1, dst        /* Actual alignment bytes to write.  */
        bic     tmp1, tmp1, tmp2        /* Aligned dc zva start address.  */
        beq     2f
-1:     stp     q0, q0, [dst], 64
-       stp     q0, q0, [dst, -32]
+1:     stnp    q0, q0, [dst], 64
+       stnp    q0, q0, [dst, -32]
        subs    count, count, 64
        b.hi    1b
 2:     mov     dst, tmp1

-- 
You are receiving this mail because:
You are on the CC list for the bug.

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [Bug string/28214] [suggestion] using stnp instead of stp in memset of aarch64
  2021-08-10  2:10 [Bug string/28214] New: [suggestion] using stnp instead of stp in memset of aarch64 wangxuszcn at foxmail dot com
@ 2022-04-21 14:41 ` wdijkstr at arm dot com
  2022-04-21 14:44 ` wangxuszcn at foxmail dot com
  2024-02-01 20:12 ` pinskia at gcc dot gnu.org
  2 siblings, 0 replies; 4+ messages in thread
From: wdijkstr at arm dot com @ 2022-04-21 14:41 UTC (permalink / raw)
  To: glibc-bugs

https://sourceware.org/bugzilla/show_bug.cgi?id=28214

Wilco <wdijkstr at arm dot com> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |wdijkstr at arm dot com

--- Comment #1 from Wilco <wdijkstr at arm dot com> ---
Doing this is only worthwhile if there is a clear performance gain on some
microarchitectures. For small memsets it is very likely the memory is
immediately reused, so using STNP is wrong. Large memsets of zero use DC ZVA
which results in write streaming on most implementations, so large non-zero
memsets is the only case where STNP might be beneficial.

-- 
You are receiving this mail because:
You are on the CC list for the bug.

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [Bug string/28214] [suggestion] using stnp instead of stp in memset of aarch64
  2021-08-10  2:10 [Bug string/28214] New: [suggestion] using stnp instead of stp in memset of aarch64 wangxuszcn at foxmail dot com
  2022-04-21 14:41 ` [Bug string/28214] " wdijkstr at arm dot com
@ 2022-04-21 14:44 ` wangxuszcn at foxmail dot com
  2024-02-01 20:12 ` pinskia at gcc dot gnu.org
  2 siblings, 0 replies; 4+ messages in thread
From: wangxuszcn at foxmail dot com @ 2022-04-21 14:44 UTC (permalink / raw)
  To: glibc-bugs

https://sourceware.org/bugzilla/show_bug.cgi?id=28214

--- Comment #2 from wangxu <wangxuszcn at foxmail dot com> ---
Thanks for your kind explanation!



---Original---
From: "wdijkstr at arm dot com"<sourceware-bugzilla@sourceware.org>
Date: Thu, Apr 21, 2022 22:41 PM
To: "wangxuszcn"<wangxuszcn@foxmail.com>;
Subject: [Bug string/28214] [suggestion] using stnp instead of stp in memset of
aarch64


https://sourceware.org/bugzilla/show_bug.cgi?id=28214

Wilco <wdijkstr at arm dot com> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |wdijkstr at arm dot com

--- Comment #1 from Wilco <wdijkstr at arm dot com> ---
Doing this is only worthwhile if there is a clear performance gain on some
microarchitectures. For small memsets it is very likely the memory is
immediately reused, so using STNP is wrong. Large memsets of zero use DC ZVA
which results in write streaming on most implementations, so large non-zero
memsets is the only case where STNP might be beneficial.

-- 
You are receiving this mail because:
You are on the CC list for the bug.

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [Bug string/28214] [suggestion] using stnp instead of stp in memset of aarch64
  2021-08-10  2:10 [Bug string/28214] New: [suggestion] using stnp instead of stp in memset of aarch64 wangxuszcn at foxmail dot com
  2022-04-21 14:41 ` [Bug string/28214] " wdijkstr at arm dot com
  2022-04-21 14:44 ` wangxuszcn at foxmail dot com
@ 2024-02-01 20:12 ` pinskia at gcc dot gnu.org
  2 siblings, 0 replies; 4+ messages in thread
From: pinskia at gcc dot gnu.org @ 2024-02-01 20:12 UTC (permalink / raw)
  To: glibc-bugs

https://sourceware.org/bugzilla/show_bug.cgi?id=28214

Andrew Pinski <pinskia at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |pinskia at gcc dot gnu.org

-- 
You are receiving this mail because:
You are on the CC list for the bug.

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2024-02-01 20:12 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-08-10  2:10 [Bug string/28214] New: [suggestion] using stnp instead of stp in memset of aarch64 wangxuszcn at foxmail dot com
2022-04-21 14:41 ` [Bug string/28214] " wdijkstr at arm dot com
2022-04-21 14:44 ` wangxuszcn at foxmail dot com
2024-02-01 20:12 ` pinskia at gcc dot gnu.org

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).