[2.27 COMMITTED][AArch64] Backport strncmp improvements

public inbox for libc-stable@sourceware.org
 help / color / mirror / Atom feed

* [2.27 COMMITTED][AArch64] Backport strncmp improvements
@ 2019-01-01  0:00 Wilco Dijkstra
  0 siblings, 0 replies; only message in thread
From: Wilco Dijkstra @ 2019-01-01  0:00 UTC (permalink / raw)
  To: libc-stable; +Cc: nd

commit 0fc5934ebd13ee4e1c12224d0e9206b76d10c626
Author: Siddhesh Poyarekar <siddhesh@sourceware.org>
Date:   Thu Mar 15 08:06:21 2018 +0530

    aarch64/strncmp: Use lsr instead of mov+lsr
    
    A lsr can do what the mov and lsr did.
    
    (cherry picked from commit b47c3e7637efb77818cbef55dcd0ed1f0ea0ddf1)

commit e0a0bd3accf5837d8fa6b6907923c13b9ac0e0ff
Author: Siddhesh Poyarekar <siddhesh@sourceware.org>
Date:   Wed Mar 14 18:51:05 2018 +0530

    aarch64/strncmp: Unbreak builds with old binutils
    
    Binutils 2.26.* and older do not support moves with shifted registers,
    so use a separate shift instruction instead.
    
    (cherry picked from commit d46f84de745db8f3f06a37048261f4e5ceacf0a3)

commit 638caf300022aab68f4c464f57e8b8ac879f5735
Author: Siddhesh Poyarekar <siddhesh@sourceware.org>
Date:   Tue Mar 13 23:57:03 2018 +0530

    aarch64: Improve strncmp for mutually misaligned inputs
    
    The mutually misaligned inputs on aarch64 are compared with a simple
    byte copy, which is not very efficient.  Enhance the comparison
    similar to strcmp by loading a double-word at a time.  The peak
    performance improvement (i.e. 4k maxlen comparisons) due to this on
    the strncmp microbenchmark is as follows:
    
    falkor: 3.5x (up to 72% time reduction)
    cortex-a73: 3.5x (up to 71% time reduction)
    cortex-a53: 3.5x (up to 71% time reduction)
    
    All mutually misaligned inputs from 16 bytes maxlen onwards show
    upwards of 15% improvement and there is no measurable effect on the
    performance of aligned/mutually aligned inputs.
    
        * sysdeps/aarch64/strncmp.S (count): New macro.
        (strncmp): Store misaligned length in SRC1 in COUNT.
        (mutual_align): Adjust.
        (misaligned8): Load dword at a time when it is safe.
    
    (cherry picked from commit 7108f1f944792ac68332967015d5e6418c5ccc88)


diff --git a/ChangeLog b/ChangeLog
index 5ddc3e7..99b6180 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,22 @@
 2019-09-06  Siddhesh Poyarekar  <siddhesh@sourceware.org>
 
+       * sysdeps/aarch64/strncmp.S (strncmp): Use lsr instead of
+       mov + lsr.
+
+2019-09-06  Siddhesh Poyarekar  <siddhesh@sourceware.org>
+
+       * sysdeps/aarch64/strncmp.S (strncmp): Use a separate shift
+       instruction to unbreak builds with binutils 2.26 and older.
+
+2019-09-06  Siddhesh Poyarekar  <siddhesh@sourceware.org>
+
+       * sysdeps/aarch64/strncmp.S (count): New macro.
+       (strncmp): Store misaligned length in SRC1 in COUNT.
+       (mutual_align): Adjust.
+       (misaligned8): Load dword at a time when it is safe.
+
+2019-09-06  Siddhesh Poyarekar  <siddhesh@sourceware.org>
+
        * sysdeps/aarch64/strcmp.S (do_misaligned): Jump back to
        do_misaligned, not misaligned8.
 
diff --git a/sysdeps/aarch64/strncmp.S b/sysdeps/aarch64/strncmp.S
index a08d2c0..759c752 100644
--- a/sysdeps/aarch64/strncmp.S
+++ b/sysdeps/aarch64/strncmp.S
@@ -49,6 +49,7 @@
 #define limit_wd       x13
 #define mask           x14
 #define endloop                x15
+#define count          mask
 
 ENTRY_ALIGN_AND_PAD (strncmp, 6, 7)
        DELOUSE (0)
@@ -58,9 +59,9 @@ ENTRY_ALIGN_AND_PAD (strncmp, 6, 7)
        eor     tmp1, src1, src2
        mov     zeroones, #REP8_01
        tst     tmp1, #7
+       and     count, src1, #7
        b.ne    L(misaligned8)
-       ands    tmp1, src1, #7
-       b.ne    L(mutual_align)
+       cbnz    count, L(mutual_align)
        /* Calculate the number of full and partial words -1.  */
        sub     limit_wd, limit, #1     /* limit != 0, so no underflow.  */
        lsr     limit_wd, limit_wd, #3  /* Convert to Dwords.  */
@@ -165,43 +166,107 @@ L(mutual_align):
        bic     src1, src1, #7
        bic     src2, src2, #7
        ldr     data1, [src1], #8
-       neg     tmp3, tmp1, lsl #3      /* 64 - bits(bytes beyond align). */
+       neg     tmp3, count, lsl #3     /* 64 - bits(bytes beyond align). */
        ldr     data2, [src2], #8
        mov     tmp2, #~0
        sub     limit_wd, limit, #1     /* limit != 0, so no underflow.  */
 #ifdef __AARCH64EB__
        /* Big-endian.  Early bytes are at MSB.  */
-       lsl     tmp2, tmp2, tmp3        /* Shift (tmp1 & 63).  */
+       lsl     tmp2, tmp2, tmp3        /* Shift (count & 63).  */
 #else
        /* Little-endian.  Early bytes are at LSB.  */
-       lsr     tmp2, tmp2, tmp3        /* Shift (tmp1 & 63).  */
+       lsr     tmp2, tmp2, tmp3        /* Shift (count & 63).  */
 #endif
        and     tmp3, limit_wd, #7
        lsr     limit_wd, limit_wd, #3
        /* Adjust the limit. Only low 3 bits used, so overflow irrelevant.  */
-       add     limit, limit, tmp1
-       add     tmp3, tmp3, tmp1
+       add     limit, limit, count
+       add     tmp3, tmp3, count
        orr     data1, data1, tmp2
        orr     data2, data2, tmp2
        add     limit_wd, limit_wd, tmp3, lsr #3
        b       L(start_realigned)
 
-L(ret0):
-       mov     result, #0
-       RET
-
        .p2align 6
+       /* Don't bother with dwords for up to 16 bytes.  */
 L(misaligned8):
-       sub     limit, limit, #1
-1:
+       cmp     limit, #16
+       b.hs    L(try_misaligned_words)
+
+L(byte_loop):
        /* Perhaps we can do better than this.  */
        ldrb    data1w, [src1], #1
        ldrb    data2w, [src2], #1
        subs    limit, limit, #1
-       ccmp    data1w, #1, #0, cs      /* NZCV = 0b0000.  */
+       ccmp    data1w, #1, #0, hi      /* NZCV = 0b0000.  */
        ccmp    data1w, data2w, #0, cs  /* NZCV = 0b0000.  */
-       b.eq    1b
+       b.eq    L(byte_loop)
+L(done):
        sub     result, data1, data2
        RET
+
+       /* Align the SRC1 to a dword by doing a bytewise compare and then do
+          the dword loop.  */
+L(try_misaligned_words):
+       lsr     limit_wd, limit, #3
+       cbz     count, L(do_misaligned)
+
+       neg     count, count
+       and     count, count, #7
+       sub     limit, limit, count
+       lsr     limit_wd, limit, #3
+
+L(page_end_loop):
+       ldrb    data1w, [src1], #1
+       ldrb    data2w, [src2], #1
+       cmp     data1w, #1
+       ccmp    data1w, data2w, #0, cs  /* NZCV = 0b0000.  */
+       b.ne    L(done)
+       subs    count, count, #1
+       b.hi    L(page_end_loop)
+
+L(do_misaligned):
+       /* Prepare ourselves for the next page crossing.  Unlike the aligned
+          loop, we fetch 1 less dword because we risk crossing bounds on
+          SRC2.  */
+       mov     count, #8
+       subs    limit_wd, limit_wd, #1
+       b.lo    L(done_loop)
+L(loop_misaligned):
+       and     tmp2, src2, #0xff8
+       eor     tmp2, tmp2, #0xff8
+       cbz     tmp2, L(page_end_loop)
+
+       ldr     data1, [src1], #8
+       ldr     data2, [src2], #8
+       sub     tmp1, data1, zeroones
+       orr     tmp2, data1, #REP8_7f
+       eor     diff, data1, data2      /* Non-zero if differences found.  */
+       bics    has_nul, tmp1, tmp2     /* Non-zero if NUL terminator.  */
+       ccmp    diff, #0, #0, eq
+       b.ne    L(not_limit)
+       subs    limit_wd, limit_wd, #1
+       b.pl    L(loop_misaligned)
+
+L(done_loop):
+       /* We found a difference or a NULL before the limit was reached.  */
+       and     limit, limit, #7
+       cbz     limit, L(not_limit)
+       /* Read the last word.  */
+       sub     src1, src1, 8
+       sub     src2, src2, 8
+       ldr     data1, [src1, limit]
+       ldr     data2, [src2, limit]
+       sub     tmp1, data1, zeroones
+       orr     tmp2, data1, #REP8_7f
+       eor     diff, data1, data2      /* Non-zero if differences found.  */
+       bics    has_nul, tmp1, tmp2     /* Non-zero if NUL terminator.  */
+       ccmp    diff, #0, #0, eq
+       b.ne    L(not_limit)
+
+L(ret0):
+       mov     result, #0
+       RET
+
 END (strncmp)
 libc_hidden_builtin_def (strncmp)

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2019-09-06 16:47 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-01-01  0:00 [2.27 COMMITTED][AArch64] Backport strncmp improvements Wilco Dijkstra

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).