* [2.26 COMMITTED][AArch64] Backport memcmp improvements
From: Wilco Dijkstra @ 2019-01-01  0:00 UTC
  To: libc-stable; +Cc: nd

commit ec4512194f035856b8a231476c9139d72f47c58f
Author: Siddhesh Poyarekar <siddhesh@sourceware.org>
Date:   Tue Mar 6 19:22:39 2018 +0530

    aarch64: Optimized memcmp for medium to large sizes
    
    This improved memcmp provides a fast path for compares of up to 16
    bytes and then compares 16 bytes at a time, thus optimizing loads
    from both sources.  The glibc memcmp microbenchmark retains
    performance (within an error of ~1ns) for smaller compare sizes and
    reduces execution time by up to 31% for compares up to 4K on the APM
    Mustang.  On Qualcomm Falkor the reduction reaches almost 48%,
    i.e. an almost 2x improvement for sizes of 2K and above.
    
        * sysdeps/aarch64/memcmp.S: Widen comparison to 16 bytes at a
        time.
    
    (cherry picked from commit 30a81dae5b752f8aa5f96e7f7c341ec57cba3585)
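A rough, portable C sketch of the idea (compare 16 bytes per iteration,
then finish with a byte-wise tail) may help readers who do not follow
AArch64 assembly.  The function name below is hypothetical and the code
is only an illustration of the strategy; the actual backported code is
the hand-written assembly in the diff further down.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Illustrative only, not the glibc implementation.  */
static int
memcmp_16b_sketch (const void *s1, const void *s2, size_t n)
{
  const unsigned char *p1 = s1, *p2 = s2;

  /* Main loop: compare 16 bytes per iteration, loading from both
     sources.  memcpy keeps the accesses well defined in C even when
     the pointers are unaligned; compilers typically lower it to plain
     loads.  */
  while (n >= 16)
    {
      uint64_t a[2], b[2];
      memcpy (a, p1, 16);
      memcpy (b, p2, 16);
      if (a[0] != b[0] || a[1] != b[1])
        break;                  /* Mismatch somewhere in this block.  */
      p1 += 16;
      p2 += 16;
      n -= 16;
    }

  /* Remaining bytes (or the mismatching block): byte-wise compare.  */
  for (; n > 0; n--, p1++, p2++)
    if (*p1 != *p2)
      return *p1 - *p2;
  return 0;
}

In the assembly below, the 16-byte block is loaded with a single ldp and
a mismatch is resolved with register-wide compares (rev/cmp/cset/cneg)
rather than a byte loop, but the overall shape is the same.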

commit 600e4e866c4de0cc0b16aec482c65da732960367
Author: Siddhesh Poyarekar <siddhesh@sourceware.org>
Date:   Fri Feb 2 10:15:20 2018 +0530

    aarch64: Use the L() macro for labels in memcmp
    
    The L() macro makes the assembly a bit more readable.
    
        * sysdeps/aarch64/memcmp.S: Use L() macro for labels.
    
    (cherry picked from commit 84c94d2fd90d84ae7e67657ee8e22c2d1b796f63)
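For readers unfamiliar with the convention: glibc's sysdep headers
define L() so that labels written as L(name) become assembler-local
symbols and stay out of the object file's symbol table.  A sketch of
the usual definition (an assumption for illustration, not copied from
the tree):

/* Assumed convention; see the architecture's sysdep.h for the real
   definition.  */
#define L(name) .L##name        /* L(loop16) expands to .Lloop16.  */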

commit 1896de3d926d299a1ed5c9f0a4f03f5a81969200
Author: Wilco Dijkstra <wdijkstr@arm.com>
Date:   Thu Aug 10 17:00:38 2017 +0100

    [AArch64] Optimized memcmp.
    
    This is an optimized memcmp for AArch64, a complete rewrite using a
    different algorithm.  The previous version split into separate cases
    for inputs that were both aligned, mutually aligned, or unaligned
    (the last handled with a byte loop).  The new version combines all
    these cases, while small inputs of less than 8 bytes are handled
    separately.
    
    This allows the main code to be sped up using unaligned loads, since
    there are now at least 8 bytes to be compared.  After the first
    8 bytes, align the first input.  This ensures each iteration does at
    most one unaligned access and that mutually aligned inputs behave as
    if aligned.  After the main loop, process the last 8 bytes using
    unaligned accesses.
    
    This improves performance of (mutually) aligned cases by 25% and
    unaligned by >500% (yes >6 times faster) on large inputs.
    
        * sysdeps/aarch64/memcmp.S (memcmp):
        Rewrite of optimized memcmp.
    
    (cherry picked from commit 922369032c604b4dcfd535e1bcddd4687e7126a5)
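The strategy described above can again be sketched in portable C: small
inputs handled separately, the first 8 bytes compared with unaligned
loads, the first input then aligned so the main loop does at most one
unaligned access per iteration, and the last 8 bytes compared with
unaligned loads from the end.  All names here (memcmp_align_sketch,
load64, cmp_bytes) are hypothetical helpers for illustration; the real
implementation is the assembly in the diff below.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* 8 bytes loaded via memcpy so unaligned access stays well defined.  */
static uint64_t
load64 (const unsigned char *p)
{
  uint64_t v;
  memcpy (&v, p, sizeof v);
  return v;
}

/* Byte-wise comparison of n bytes with memcmp's return convention.  */
static int
cmp_bytes (const unsigned char *p1, const unsigned char *p2, size_t n)
{
  for (size_t i = 0; i < n; i++)
    if (p1[i] != p2[i])
      return p1[i] - p2[i];
  return 0;
}

/* Illustrative only, not the glibc implementation.  */
static int
memcmp_align_sketch (const void *s1, const void *s2, size_t n)
{
  const unsigned char *p1 = s1, *p2 = s2;

  /* Small inputs of less than 8 bytes are handled separately.  */
  if (n < 8)
    return cmp_bytes (p1, p2, n);

  /* First 8 bytes: one unaligned load from each source.  */
  if (load64 (p1) != load64 (p2))
    return cmp_bytes (p1, p2, 8);

  /* Align the first input.  Advancing both pointers by the same
     amount (at most 8) only skips bytes already known to be equal.  */
  size_t done = 8 - (uintptr_t) (p1 + 8) % 8;
  const unsigned char *q1 = p1 + done;  /* now 8-byte aligned */
  const unsigned char *q2 = p2 + done;
  size_t left = n - done;

  /* Main loop: at most one unaligned access (q2) per iteration, and
     mutually aligned inputs behave as if aligned.  */
  while (left > 8)
    {
      if (load64 (q1) != load64 (q2))
        return cmp_bytes (q1, q2, 8);
      q1 += 8;
      q2 += 8;
      left -= 8;
    }

  /* Last 8 bytes: unaligned loads from the end of each buffer; the
     overlap with already-compared bytes is known to be equal.  */
  return cmp_bytes (p1 + n - 8, p2 + n - 8, 8);
}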


diff --git a/ChangeLog b/ChangeLog
index 4cd747a..204d047 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,17 @@
+2019-09-06  Siddhesh Poyarekar  <siddhesh@sourceware.org>
+
+       * sysdeps/aarch64/memcmp.S: Widen comparison to 16 bytes at a
+       time.
+
+2019-09-06  Siddhesh Poyarekar  <siddhesh@sourceware.org>
+
+       * sysdeps/aarch64/memcmp.S: Use L() macro for labels.
+
+2019-09-06  Wilco Dijkstra  <wdijkstr@arm.com>
+
+       * sysdeps/aarch64/memcmp.S (memcmp):
+       Rewrite of optimized memcmp.
+
 2019-07-12  Adhemerval Zanella  <adhemerval.zanella@linaro.org>
 
        [BZ #24699]
diff --git a/sysdeps/aarch64/memcmp.S b/sysdeps/aarch64/memcmp.S
index 4cfcb89..d074c98 100644
--- a/sysdeps/aarch64/memcmp.S
+++ b/sysdeps/aarch64/memcmp.S
@@ -22,132 +22,132 @@
 
 /* Assumptions:
  *
- * ARMv8-a, AArch64
+ * ARMv8-a, AArch64, unaligned accesses.
  */
 
 /* Parameters and result.  */
 #define src1           x0
 #define src2           x1
 #define limit          x2
-#define result         x0
+#define result         w0
 
 /* Internal variables.  */
 #define data1          x3
 #define data1w         w3
-#define data2          x4
-#define data2w         w4
-#define has_nul                x5
-#define diff           x6
-#define endloop                x7
-#define tmp1           x8
-#define tmp2           x9
-#define tmp3           x10
-#define pos            x11
-#define limit_wd       x12
-#define mask           x13
+#define data1h         x4
+#define data2          x5
+#define data2w         w5
+#define data2h         x6
+#define tmp1           x7
+#define tmp2           x8
 
 ENTRY_ALIGN (memcmp, 6)
        DELOUSE (0)
        DELOUSE (1)
        DELOUSE (2)
-       cbz     limit, L(ret0)
-       eor     tmp1, src1, src2
-       tst     tmp1, #7
-       b.ne    L(misaligned8)
-       ands    tmp1, src1, #7
-       b.ne    L(mutual_align)
-       add     limit_wd, limit, #7
-       lsr     limit_wd, limit_wd, #3
-       /* Start of performance-critical section  -- one 64B cache line.  */
-L(loop_aligned):
-       ldr     data1, [src1], #8
-       ldr     data2, [src2], #8
-L(start_realigned):
-       subs    limit_wd, limit_wd, #1
-       eor     diff, data1, data2      /* Non-zero if differences found.  */
-       csinv   endloop, diff, xzr, ne  /* Last Dword or differences.  */
-       cbz     endloop, L(loop_aligned)
-       /* End of performance-critical section  -- one 64B cache line.  */
-
-       /* Not reached the limit, must have found a diff.  */
-       cbnz    limit_wd, L(not_limit)
-
-       /* Limit % 8 == 0 => all bytes significant.  */
-       ands    limit, limit, #7
-       b.eq    L(not_limit)
-
-       lsl     limit, limit, #3        /* Bits -> bytes.  */
-       mov     mask, #~0
-#ifdef __AARCH64EB__
-       lsr     mask, mask, limit
-#else
-       lsl     mask, mask, limit
-#endif
-       bic     data1, data1, mask
-       bic     data2, data2, mask
-
-       orr     diff, diff, mask
-L(not_limit):
 
-#ifndef        __AARCH64EB__
-       rev     diff, diff
+       subs    limit, limit, 8
+       b.lo    L(less8)
+
+       ldr     data1, [src1], 8
+       ldr     data2, [src2], 8
+       cmp     data1, data2
+       b.ne    L(return)
+
+       subs    limit, limit, 8
+       b.gt    L(more16)
+
+       ldr     data1, [src1, limit]
+       ldr     data2, [src2, limit]
+       b       L(return)
+
+L(more16):
+       ldr     data1, [src1], 8
+       ldr     data2, [src2], 8
+       cmp     data1, data2
+       bne     L(return)
+
+       /* Jump directly to comparing the last 16 bytes for 32 byte (or less)
+          strings.  */
+       subs    limit, limit, 16
+       b.ls    L(last_bytes)
+
+       /* We overlap loads between 0-32 bytes at either side of SRC1 when we
+          try to align, so limit it only to strings larger than 128 bytes.  */
+       cmp     limit, 96
+       b.ls    L(loop8)
+
+       /* Align src1 and adjust src2 with bytes not yet done.  */
+       and     tmp1, src1, 15
+       add     limit, limit, tmp1
+       sub     src1, src1, tmp1
+       sub     src2, src2, tmp1
+
+       /* Loop performing 16 bytes per iteration using aligned src1.
+          Limit is pre-decremented by 16 and must be larger than zero.
+          Exit if <= 16 bytes left to do or if the data is not equal.  */
+       .p2align 4
+L(loop16):
+       ldp     data1, data1h, [src1], 16
+       ldp     data2, data2h, [src2], 16
+       subs    limit, limit, 16
+       ccmp    data1, data2, 0, hi
+       ccmp    data1h, data2h, 0, eq
+       b.eq    L(loop16)
+
+       cmp     data1, data2
+       bne     L(return)
+       mov     data1, data1h
+       mov     data2, data2h
+       cmp     data1, data2
+       bne     L(return)
+
+       /* Compare last 1-16 bytes using unaligned access.  */
+L(last_bytes):
+       add     src1, src1, limit
+       add     src2, src2, limit
+       ldp     data1, data1h, [src1]
+       ldp     data2, data2h, [src2]
+       cmp     data1, data2
+       bne     L(return)
+       mov     data1, data1h
+       mov     data2, data2h
+       cmp     data1, data2
+
+       /* Compare data bytes and set return value to 0, -1 or 1.  */
+L(return):
+#ifndef __AARCH64EB__
        rev     data1, data1
        rev     data2, data2
 #endif
-       /* The MS-non-zero bit of DIFF marks either the first bit
-          that is different, or the end of the significant data.
-          Shifting left now will bring the critical information into the
-          top bits.  */
-       clz     pos, diff
-       lsl     data1, data1, pos
-       lsl     data2, data2, pos
-       /* But we need to zero-extend (char is unsigned) the value and then
-          perform a signed 32-bit subtraction.  */
-       lsr     data1, data1, #56
-       sub     result, data1, data2, lsr #56
-       RET
-
-L(mutual_align):
-       /* Sources are mutually aligned, but are not currently at an
-          alignment boundary.  Round down the addresses and then mask off
-          the bytes that precede the start point.  */
-       bic     src1, src1, #7
-       bic     src2, src2, #7
-       add     limit, limit, tmp1      /* Adjust the limit for the extra.  */
-       lsl     tmp1, tmp1, #3          /* Bytes beyond alignment -> bits.  */
-       ldr     data1, [src1], #8
-       neg     tmp1, tmp1              /* Bits to alignment -64.  */
-       ldr     data2, [src2], #8
-       mov     tmp2, #~0
-#ifdef __AARCH64EB__
-       /* Big-endian.  Early bytes are at MSB.  */
-       lsl     tmp2, tmp2, tmp1        /* Shift (tmp1 & 63).  */
-#else
-       /* Little-endian.  Early bytes are at LSB.  */
-       lsr     tmp2, tmp2, tmp1        /* Shift (tmp1 & 63).  */
-#endif
-       add     limit_wd, limit, #7
-       orr     data1, data1, tmp2
-       orr     data2, data2, tmp2
-       lsr     limit_wd, limit_wd, #3
-       b       L(start_realigned)
-
-L(ret0):
-       mov     result, #0
-       RET
-
-       .p2align 6
-L(misaligned8):
-       sub     limit, limit, #1
-1:
-       /* Perhaps we can do better than this.  */
-       ldrb    data1w, [src1], #1
-       ldrb    data2w, [src2], #1
-       subs    limit, limit, #1
-       ccmp    data1w, data2w, #0, cs  /* NZCV = 0b0000.  */
-       b.eq    1b
-       sub     result, data1, data2
-       RET
+       cmp     data1, data2
+L(ret_eq):
+       cset    result, ne
+       cneg    result, result, lo
+       ret
+
+       .p2align 4
+       /* Compare up to 8 bytes.  Limit is [-8..-1].  */
+L(less8):
+       adds    limit, limit, 4
+       b.lo    L(less4)
+       ldr     data1w, [src1], 4
+       ldr     data2w, [src2], 4
+       cmp     data1w, data2w
+       b.ne    L(return)
+       sub     limit, limit, 4
+L(less4):
+       adds    limit, limit, 4
+       beq     L(ret_eq)
+L(byte_loop):
+       ldrb    data1w, [src1], 1
+       ldrb    data2w, [src2], 1
+       subs    limit, limit, 1
+       ccmp    data1w, data2w, 0, ne   /* NZCV = 0b0000.  */
+       b.eq    L(byte_loop)
+       sub     result, data1w, data2w
+       ret
+
 END (memcmp)
 #undef bcmp
 weak_alias (memcmp, bcmp)
