public inbox for libc-alpha@sourceware.org
 help / color / mirror / Atom feed
* [PATCH 0/3] Add ifunc support for str{nlen, cmp, ncmp}
@ 2023-08-22  2:11 dengjianbo
  2023-08-22  2:11 ` [PATCH 1/3] Loongarch: Add ifunc support for strnlen{aligned, lsx, lasx} dengjianbo
                   ` (2 more replies)
  0 siblings, 3 replies; 9+ messages in thread
From: dengjianbo @ 2023-08-22  2:11 UTC (permalink / raw)
  To: libc-alpha
  Cc: adhemerval.zanella, xry111, caiyinyu, xuchenghua, huangpei, dengjianbo

This patch adds multiple versions of strnlen, strcmp, strncmp implemented
by Loongarch basic instructions, LSX instructions, LASX instructions.
Even though this implementation experiences performance degradation in a
few cases, overall, the performance gains are significant.

See:
https://github.com/jiadengx/glibc_test/blob/main/bench/strnlen_compare.out
https://github.com/jiadengx/glibc_test/blob/main/bench/strcmp_compare.out
https://github.com/jiadengx/glibc_test/blob/main/bench/strncmp_compare.out

In the data, positive values in the parentheses indicate that our
implementation took less time, indicating a performance improvement;
negative values in the parentheses mean that our implementation took
more time, indicating a decrease in performance. Following is a
summary of the performance compared with the generic version in the
glibc microbenchmark:

name                  reduce time percent
strnlen-aligned       >10%
strnlen-lsx           50%-78%
strnlen-lasx          50%-88%
strcmp-aligned        0%-10% for aligned comparison
                      10%-20% for unaligned comparison
strcmp-lsx            0%-50%
strncmp-aligned       0%-10% for aligned comparison
                      10%-25% for unaligned comparison
strncmp-lsx           0%-50%

dengjianbo (3):
  Loongarch: Add ifunc support for strnlen{aligned, lsx, lasx}
  Loongarch: Add ifunc support for strcmp{aligned, lsx}
  Loongarch: Add ifunc support for strncmp{aligned, lsx}

 sysdeps/loongarch/lp64/multiarch/Makefile     |   7 +
 .../lp64/multiarch/ifunc-impl-list.c          |  22 ++
 .../loongarch/lp64/multiarch/ifunc-strcmp.h   |  38 +++
 .../loongarch/lp64/multiarch/ifunc-strncmp.h  |  38 +++
 .../loongarch/lp64/multiarch/ifunc-strnlen.h  |  41 ++++
 .../loongarch/lp64/multiarch/strcmp-aligned.S | 179 ++++++++++++++
 sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S | 162 +++++++++++++
 sysdeps/loongarch/lp64/multiarch/strcmp.c     |  35 +++
 .../lp64/multiarch/strncmp-aligned.S          | 218 ++++++++++++++++++
 .../loongarch/lp64/multiarch/strncmp-lsx.S    | 206 +++++++++++++++++
 sysdeps/loongarch/lp64/multiarch/strncmp.c    |  35 +++
 .../lp64/multiarch/strnlen-aligned.S          | 102 ++++++++
 .../loongarch/lp64/multiarch/strnlen-lasx.S   | 100 ++++++++
 .../loongarch/lp64/multiarch/strnlen-lsx.S    |  89 +++++++
 sysdeps/loongarch/lp64/multiarch/strnlen.c    |  39 ++++
 15 files changed, 1311 insertions(+)
 create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-strcmp.h
 create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-strncmp.h
 create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-strnlen.h
 create mode 100644 sysdeps/loongarch/lp64/multiarch/strcmp-aligned.S
 create mode 100644 sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S
 create mode 100644 sysdeps/loongarch/lp64/multiarch/strcmp.c
 create mode 100644 sysdeps/loongarch/lp64/multiarch/strncmp-aligned.S
 create mode 100644 sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S
 create mode 100644 sysdeps/loongarch/lp64/multiarch/strncmp.c
 create mode 100644 sysdeps/loongarch/lp64/multiarch/strnlen-aligned.S
 create mode 100644 sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S
 create mode 100644 sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S
 create mode 100644 sysdeps/loongarch/lp64/multiarch/strnlen.c

-- 
2.40.0


^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH 1/3] Loongarch: Add ifunc support for strnlen{aligned, lsx, lasx}
  2023-08-22  2:11 [PATCH 0/3] Add ifunc support for str{nlen, cmp, ncmp} dengjianbo
@ 2023-08-22  2:11 ` dengjianbo
  2023-08-22  2:11 ` [PATCH 2/3] Loongarch: Add ifunc support for strcmp{aligned, lsx} dengjianbo
  2023-08-22  2:11 ` [PATCH 3/3] Loongarch: Add ifunc support for strncmp{aligned, lsx} dengjianbo
  2 siblings, 0 replies; 9+ messages in thread
From: dengjianbo @ 2023-08-22  2:11 UTC (permalink / raw)
  To: libc-alpha
  Cc: adhemerval.zanella, xry111, caiyinyu, xuchenghua, huangpei, dengjianbo

Based on the glibc microbenchmark, strnlen-aligned implementation could
reduce the runtime more than 10%, strnlen-lsx implementation could reduce
the runtime about 50%-78%, strnlen-lasx implementation could reduce the
runtime about 50%-88%.
---
 sysdeps/loongarch/lp64/multiarch/Makefile     |   3 +
 .../lp64/multiarch/ifunc-impl-list.c          |   8 ++
 .../loongarch/lp64/multiarch/ifunc-strnlen.h  |  41 +++++++
 .../lp64/multiarch/strnlen-aligned.S          | 102 ++++++++++++++++++
 .../loongarch/lp64/multiarch/strnlen-lasx.S   | 100 +++++++++++++++++
 .../loongarch/lp64/multiarch/strnlen-lsx.S    |  89 +++++++++++++++
 sysdeps/loongarch/lp64/multiarch/strnlen.c    |  39 +++++++
 7 files changed, 382 insertions(+)
 create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-strnlen.h
 create mode 100644 sysdeps/loongarch/lp64/multiarch/strnlen-aligned.S
 create mode 100644 sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S
 create mode 100644 sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S
 create mode 100644 sysdeps/loongarch/lp64/multiarch/strnlen.c

diff --git a/sysdeps/loongarch/lp64/multiarch/Makefile b/sysdeps/loongarch/lp64/multiarch/Makefile
index afa5104174..c4dd3143d1 100644
--- a/sysdeps/loongarch/lp64/multiarch/Makefile
+++ b/sysdeps/loongarch/lp64/multiarch/Makefile
@@ -3,6 +3,9 @@ sysdep_routines += \
   strlen-aligned \
   strlen-lsx \
   strlen-lasx \
+  strnlen-aligned \
+  strnlen-lsx \
+  strnlen-lasx \
   strchr-aligned \
   strchr-lsx \
   strchr-lasx \
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
index 25eb96b061..7cec0b7724 100644
--- a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
@@ -38,6 +38,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_aligned)
 	      )
 
+  IFUNC_IMPL (i, name, strnlen,
+#if !defined __loongarch_soft_float
+	      IFUNC_IMPL_ADD (array, i, strnlen, SUPPORT_LASX, __strnlen_lasx)
+	      IFUNC_IMPL_ADD (array, i, strnlen, SUPPORT_LSX, __strnlen_lsx)
+#endif
+	      IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_aligned)
+	      )
+
   IFUNC_IMPL (i, name, strchr,
 #if !defined __loongarch_soft_float
 	      IFUNC_IMPL_ADD (array, i, strchr, SUPPORT_LASX, __strchr_lasx)
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-strnlen.h b/sysdeps/loongarch/lp64/multiarch/ifunc-strnlen.h
new file mode 100644
index 0000000000..5cf8981021
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-strnlen.h
@@ -0,0 +1,41 @@
+/* Common definition for strnlen ifunc selections.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <ldsodefs.h>
+#include <ifunc-init.h>
+
+#if !defined __loongarch_soft_float
+extern __typeof (REDIRECT_NAME) OPTIMIZE (lasx) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (lsx) attribute_hidden;
+#endif
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (aligned) attribute_hidden;
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+#if !defined __loongarch_soft_float
+  if (SUPPORT_LASX)
+    return OPTIMIZE (lasx);
+  else if (SUPPORT_LSX)
+    return OPTIMIZE (lsx);
+  else
+#endif
+    return OPTIMIZE (aligned);
+}
diff --git a/sysdeps/loongarch/lp64/multiarch/strnlen-aligned.S b/sysdeps/loongarch/lp64/multiarch/strnlen-aligned.S
new file mode 100644
index 0000000000..b900430a5d
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strnlen-aligned.S
@@ -0,0 +1,102 @@
+/* Optimized strnlen implementation using basic Loongarch instructions.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc)
+# define STRNLEN __strnlen_aligned
+#else
+# define STRNLEN __strnlen
+#endif
+
+LEAF(STRNLEN, 6)
+    beqz        a1, L(out)
+    lu12i.w     a2, 0x01010
+    andi        t1, a0, 0x7
+    move        t4, a0
+
+    bstrins.d   a0, zero, 2, 0
+    ori         a2, a2, 0x101
+    li.w        t0, -1
+    ld.d        t2, a0, 0
+
+    slli.d      t3, t1, 3
+    bstrins.d   a2, a2, 63, 32
+    li.w        t5, 8
+    slli.d      a3, a2, 7
+
+    sub.w       t1, t5, t1
+    sll.d       t0, t0, t3
+    orn         t2, t2, t0
+    sub.d       t0, t2, a2
+
+
+    andn        t3, a3, t2
+    and         t0, t0, t3
+    bnez        t0, L(count_pos)
+    sub.d       t5, a1, t1
+
+    bgeu        t1, a1, L(out)
+    addi.d      a0, a0, 8
+L(loop):
+    ld.d        t2, a0, 0
+    sub.d       t0, t2, a2
+
+    andn        t1, a3, t2
+    sltui       t6, t5, 9
+    and         t0, t0, t1
+    or          t7, t0, t6
+
+    bnez        t7, L(count_pos)
+    ld.d        t2, a0, 8
+    addi.d      a0, a0, 16
+    sub.d       t0, t2, a2
+
+
+    andn        t1, a3, t2
+    sltui       t6, t5, 17
+    and         t0, t0, t1
+    addi.d      t5, t5, -16
+
+    or          t7, t0, t6
+    beqz        t7, L(loop)
+    addi.d      a0, a0, -8
+L(count_pos):
+    ctz.d       t1, t0
+
+    sub.d       a0, a0, t4
+    srli.d      t1, t1, 3
+    add.d       a0, t1, a0
+    sltu        t0, a0, a1
+
+    masknez     t1, a1, t0
+    maskeqz     a0, a0, t0
+    or          a0, a0, t1
+    jr          ra
+
+
+L(out):
+    move        a0, a1
+    jr          ra
+END(STRNLEN)
+
+weak_alias (STRNLEN, strnlen)
+libc_hidden_builtin_def (STRNLEN)
diff --git a/sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S b/sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S
new file mode 100644
index 0000000000..2c03d3d9b4
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S
@@ -0,0 +1,100 @@
+/* Optimized strnlen implementation using loongarch LASX instructions
+   Copyright (C) 2023 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc) && !defined __loongarch_soft_float
+
+# define STRNLEN __strnlen_lasx
+
+LEAF(STRNLEN, 6)
+    beqz            a1, L(ret0)
+    andi            t1, a0, 0x3f
+    li.d            t3, 65
+    sub.d           a2, a0, t1
+
+    xvld            xr0, a2, 0
+    xvld            xr1, a2, 32
+    sub.d           t1, t3, t1
+    move            a3, a0
+
+    sltu            t1, a1, t1
+    xvmsknz.b       xr0, xr0
+    xvmsknz.b       xr1, xr1
+    xvpickve.w      xr2, xr0, 4
+
+    xvpickve.w      xr3, xr1, 4
+    vilvl.h         vr0, vr2, vr0
+    vilvl.h         vr1, vr3, vr1
+    vilvl.w         vr0, vr1, vr0
+
+
+    movfr2gr.d      t0, fa0
+    sra.d           t0, t0, a0
+    orn             t1, t1, t0
+    bnez            t1, L(end)
+
+    add.d           a4, a0, a1
+    move            a0, a2
+    addi.d          a4, a4, -1
+    bstrins.d       a4, zero, 5, 0
+
+L(loop):
+    xvld            xr0, a0, 64
+    xvld            xr1, a0, 96
+    addi.d          a0, a0, 64
+    beq             a0, a4, L(out)
+
+    xvmin.bu        xr2, xr0, xr1
+    xvsetanyeqz.b   fcc0, xr2
+    bceqz           fcc0, L(loop)
+L(out):
+    xvmsknz.b       xr0, xr0
+
+
+    xvmsknz.b       xr1, xr1
+    xvpickve.w      xr2, xr0, 4
+    xvpickve.w      xr3, xr1, 4
+    vilvl.h         vr0, vr2, vr0
+
+    vilvl.h         vr1, vr3, vr1
+    vilvl.w         vr0, vr1, vr0
+    movfr2gr.d      t0, fa0
+L(end):
+    sub.d           a0, a0, a3
+
+    cto.d           t0, t0
+    add.d           a0, a0, t0
+    sltu            t1, a0, a1
+    masknez         t0, a1, t1
+
+    maskeqz         t1, a0, t1
+    or              a0, t0, t1
+    jr              ra
+L(ret0):
+    move            a0, zero
+
+
+    jr              ra
+END(STRNLEN)
+
+libc_hidden_def (STRNLEN)
+#endif
diff --git a/sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S b/sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S
new file mode 100644
index 0000000000..b769a89584
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S
@@ -0,0 +1,89 @@
+/* Optimized strnlen implementation using loongarch LSX instructions
+   Copyright (C) 2023 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc) && !defined __loongarch_soft_float
+
+# define STRNLEN __strnlen_lsx
+
+LEAF(STRNLEN, 6)
+    beqz            a1, L(ret0)
+    andi            t1, a0, 0x1f
+    li.d            t3, 33
+    sub.d           a2, a0, t1
+
+    vld             vr0, a2, 0
+    vld             vr1, a2, 16
+    sub.d           t1, t3, t1
+    move            a3, a0
+
+    sltu            t1, a1, t1
+    vmsknz.b        vr0, vr0
+    vmsknz.b        vr1, vr1
+    vilvl.h         vr0, vr1, vr0
+
+    movfr2gr.s      t0, fa0
+    sra.w           t0, t0, a0
+    orn             t1, t1, t0
+    bnez            t1, L(end)
+
+
+    add.d           a4, a0, a1
+    move            a0, a2
+    addi.d          a4, a4, -1
+    bstrins.d       a4, zero, 4, 0
+
+L(loop):
+    vld             vr0, a0, 32
+    vld             vr1, a0, 48
+    addi.d          a0, a0, 32
+    beq             a0, a4, L(out)
+
+    vmin.bu         vr2, vr0, vr1
+    vsetanyeqz.b    fcc0, vr2
+    bceqz           fcc0, L(loop)
+L(out):
+    vmsknz.b        vr0, vr0
+
+    vmsknz.b        vr1, vr1
+    vilvl.h         vr0, vr1, vr0
+    movfr2gr.s      t0, fa0
+L(end):
+    sub.d           a0, a0, a3
+
+
+    cto.w           t0, t0
+    add.d           a0, a0, t0
+    sltu            t1, a0, a1
+    masknez         t0, a1, t1
+
+    maskeqz         t1, a0, t1
+    or              a0, t0, t1
+    jr              ra
+L(ret0):
+    move            a0, zero
+
+    jr              ra
+END(STRNLEN)
+
+libc_hidden_builtin_def (STRNLEN)
+#endif
diff --git a/sysdeps/loongarch/lp64/multiarch/strnlen.c b/sysdeps/loongarch/lp64/multiarch/strnlen.c
new file mode 100644
index 0000000000..38b7a25a7a
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strnlen.c
@@ -0,0 +1,39 @@
+/* Multiple versions of strnlen.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+# define strnlen __redirect_strnlen
+# define __strnlen __redirect___strnlen
+# include <string.h>
+# undef __strnlen
+# undef strnlen
+
+# define SYMBOL_NAME strnlen
+# include "ifunc-strnlen.h"
+
+libc_ifunc_redirected (__redirect_strnlen, __strnlen, IFUNC_SELECTOR ());
+weak_alias (__strnlen, strnlen);
+# ifdef SHARED
+__hidden_ver1 (__strnlen, __GI___strnlen, __redirect___strnlen)
+  __attribute__((visibility ("hidden"))) __attribute_copy__ (strnlen);
+__hidden_ver1 (strnlen, __GI_strnlen, __redirect_strnlen)
+  __attribute__((weak, visibility ("hidden"))) __attribute_copy__ (strnlen);
+# endif
+#endif
-- 
2.40.0


^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH 2/3] Loongarch: Add ifunc support for strcmp{aligned, lsx}
  2023-08-22  2:11 [PATCH 0/3] Add ifunc support for str{nlen, cmp, ncmp} dengjianbo
  2023-08-22  2:11 ` [PATCH 1/3] Loongarch: Add ifunc support for strnlen{aligned, lsx, lasx} dengjianbo
@ 2023-08-22  2:11 ` dengjianbo
  2023-08-22  2:11 ` [PATCH 3/3] Loongarch: Add ifunc support for strncmp{aligned, lsx} dengjianbo
  2 siblings, 0 replies; 9+ messages in thread
From: dengjianbo @ 2023-08-22  2:11 UTC (permalink / raw)
  To: libc-alpha
  Cc: adhemerval.zanella, xry111, caiyinyu, xuchenghua, huangpei, dengjianbo

Based on the glibc microbenchmark, strcmp-aligned implementation could
reduce the runtime 0%-10% for aligned comparison, 10%-20% for unaligned
comparison, strcmp-lsx implementation could reduce the runtime 0%-50%.
---
 sysdeps/loongarch/lp64/multiarch/Makefile     |   2 +
 .../lp64/multiarch/ifunc-impl-list.c          |   7 +
 .../loongarch/lp64/multiarch/ifunc-strcmp.h   |  38 ++++
 .../loongarch/lp64/multiarch/strcmp-aligned.S | 179 ++++++++++++++++++
 sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S | 162 ++++++++++++++++
 sysdeps/loongarch/lp64/multiarch/strcmp.c     |  35 ++++
 6 files changed, 423 insertions(+)
 create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-strcmp.h
 create mode 100644 sysdeps/loongarch/lp64/multiarch/strcmp-aligned.S
 create mode 100644 sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S
 create mode 100644 sysdeps/loongarch/lp64/multiarch/strcmp.c

diff --git a/sysdeps/loongarch/lp64/multiarch/Makefile b/sysdeps/loongarch/lp64/multiarch/Makefile
index c4dd3143d1..d5a500decd 100644
--- a/sysdeps/loongarch/lp64/multiarch/Makefile
+++ b/sysdeps/loongarch/lp64/multiarch/Makefile
@@ -12,6 +12,8 @@ sysdep_routines += \
   strchrnul-aligned \
   strchrnul-lsx \
   strchrnul-lasx \
+  strcmp-aligned \
+  strcmp-lsx \
   memcpy-aligned \
   memcpy-unaligned \
   memmove-unaligned \
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
index 7cec0b7724..9183b7da24 100644
--- a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
@@ -62,6 +62,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_aligned)
 	      )
 
+  IFUNC_IMPL (i, name, strcmp,
+#if !defined __loongarch_soft_float
+	      IFUNC_IMPL_ADD (array, i, strcmp, SUPPORT_LSX, __strcmp_lsx)
+#endif
+	      IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_aligned)
+	      )
+
   IFUNC_IMPL (i, name, memcpy,
 #if !defined __loongarch_soft_float
               IFUNC_IMPL_ADD (array, i, memcpy, SUPPORT_LASX, __memcpy_lasx)
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-strcmp.h b/sysdeps/loongarch/lp64/multiarch/ifunc-strcmp.h
new file mode 100644
index 0000000000..ca26352bec
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-strcmp.h
@@ -0,0 +1,38 @@
+/* Common definition for strcmp ifunc selection.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <ldsodefs.h>
+#include <ifunc-init.h>
+
+#if !defined __loongarch_soft_float
+extern __typeof (REDIRECT_NAME) OPTIMIZE (lsx) attribute_hidden;
+#endif
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (aligned) attribute_hidden;
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+#if !defined __loongarch_soft_float
+  if (SUPPORT_LSX)
+    return OPTIMIZE (lsx);
+  else
+#endif
+    return OPTIMIZE (aligned);
+}
diff --git a/sysdeps/loongarch/lp64/multiarch/strcmp-aligned.S b/sysdeps/loongarch/lp64/multiarch/strcmp-aligned.S
new file mode 100644
index 0000000000..f5f4f3364e
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strcmp-aligned.S
@@ -0,0 +1,179 @@
+/* Optimized strcmp implementation using basic Loongarch instructions.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc)
+# define STRCMP_NAME __strcmp_aligned
+#else
+# define STRCMP_NAME strcmp
+#endif
+
+LEAF(STRCMP_NAME, 6)
+    lu12i.w     a4, 0x01010
+    andi        a2, a0, 0x7
+    ori         a4, a4, 0x101
+    andi        a3, a1, 0x7
+
+    bstrins.d   a4, a4, 63, 32
+    li.d        t7, -1
+    li.d        t8, 8
+    slli.d      a5, a4, 7
+
+    bne         a2, a3, L(unaligned)
+    bstrins.d   a0, zero, 2, 0
+    bstrins.d   a1, zero, 2, 0
+    ld.d        t0, a0, 0
+
+    ld.d        t1, a1, 0
+    slli.d      t3, a2, 3
+    sll.d       t2, t7, t3
+    orn         t0, t0, t2
+
+
+    orn         t1, t1, t2
+    sub.d       t2, t0, a4
+    andn        t3, a5, t0
+    and         t2, t2, t3
+
+    bne         t0, t1, L(al_end)
+L(al_loop):
+    bnez        t2, L(ret0)
+    ldx.d       t0, a0, t8
+    ldx.d       t1, a1, t8
+
+    addi.d      t8, t8, 8
+    sub.d       t2, t0, a4
+    andn        t3, a5, t0
+    and         t2, t2, t3
+
+    beq         t0, t1, L(al_loop)
+L(al_end):
+    xor         t3, t0, t1
+    or          t2, t2, t3
+    ctz.d       t3, t2
+
+
+    bstrins.d   t3, zero, 2, 0
+    srl.d       t0, t0, t3
+    srl.d       t1, t1, t3
+    andi        t0, t0, 0xff
+
+    andi        t1, t1, 0xff
+    sub.d       a0, t0, t1
+    jr          ra
+    nop
+
+L(ret0):
+    move        a0, zero
+    jr          ra
+    nop
+    nop
+
+L(unaligned):
+    slt         a6, a3, a2
+    xor         t0, a0, a1
+    maskeqz     t0, t0, a6
+    xor         a0, a0, t0
+
+
+    xor         a1, a1, t0
+    andi        a2, a0, 0x7
+    andi        a3, a1, 0x7
+    bstrins.d   a0, zero, 2, 0
+
+    bstrins.d   a1, zero, 2, 0
+    ld.d        t4, a0, 0
+    ld.d        t1, a1, 0
+    slli.d      a2, a2, 3
+
+    slli.d      a3, a3, 3
+    srl.d       t0, t4, a2
+    srl.d       t1, t1, a3
+    srl.d       t5, t7, a3
+
+    orn         t0, t0, t5
+    orn         t1, t1, t5
+    bne         t0, t1, L(not_equal)
+    sll.d       t5, t7, a2
+
+
+    sub.d       a3, a2, a3
+    orn         t4, t4, t5
+    sub.d       a2, zero, a3
+    sub.d       t2, t4, a4
+
+    andn        t3, a5, t4
+    and         t2, t2, t3
+    bnez        t2, L(find_zero)
+L(un_loop):
+    srl.d       t5, t4, a3
+
+    ldx.d       t4, a0, t8
+    ldx.d       t1, a1, t8
+    addi.d      t8, t8, 8
+    sll.d       t0, t4, a2
+
+    or          t0, t0, t5
+    bne         t0, t1, L(not_equal)
+    sub.d       t2, t4, a4
+    andn        t3, a5, t4
+
+
+    and         t2, t2, t3
+    beqz        t2, L(un_loop)
+L(find_zero):
+    sub.d       t2, t0, a4
+    andn        t3, a5, t0
+
+    and         t2, t2, t3
+    bnez        t2, L(ret0)
+    ldx.d       t1, a1, t8
+    srl.d       t0, t4, a3
+
+L(not_equal):
+    sub.d       t2, t0, a4
+    andn        t3, a5, t0
+    and         t2, t2, t3
+    xor         t3, t0, t1
+
+    or          t2, t2, t3
+L(un_end):
+    ctz.d       t3, t2
+    bstrins.d   t3, zero, 2, 0
+    srl.d       t0, t0, t3
+
+
+    srl.d       t1, t1, t3
+    andi        t0, t0, 0xff
+    andi        t1, t1, 0xff
+    sub.d       t2, t0, t1
+
+
+    sub.d       t3, t1, t0
+    masknez     t0, t2, a6
+    maskeqz     t1, t3, a6
+    or          a0, t0, t1
+
+    jr	ra
+END(STRCMP_NAME)
+
+libc_hidden_builtin_def (STRCMP_NAME)
diff --git a/sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S b/sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S
new file mode 100644
index 0000000000..19776d9cb2
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S
@@ -0,0 +1,162 @@
+/* Optimized strcmp implementation using Loongarch LSX instructions.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc) && !defined __loongarch_soft_float
+
+# define STRCMP	__strcmp_lsx
+
+L(magic_num):
+    .align		6
+    .dword		0x0706050403020100
+    .dword		0x0f0e0d0c0b0a0908
+ENTRY_NO_ALIGN(STRCMP)
+    pcaddi          t0, -4
+    andi            a2, a0, 0xf
+    vld             vr2, t0, 0
+    andi            a3, a1, 0xf
+
+    bne             a2, a3, L(unaligned)
+    bstrins.d       a0, zero, 3, 0
+    bstrins.d       a1, zero, 3, 0
+    vld             vr0, a0, 0
+
+    vld             vr1, a1, 0
+    vreplgr2vr.b    vr3, a2
+    vslt.b          vr2, vr2, vr3
+    vseq.b          vr3, vr0, vr1
+
+
+    vmin.bu         vr3, vr0, vr3
+    vor.v           vr3, vr3, vr2
+    vsetanyeqz.b    fcc0, vr3
+    bcnez           fcc0, L(al_out)
+
+L(al_loop):
+    vld             vr0, a0, 16
+    vld             vr1, a1, 16
+    addi.d          a0, a0, 16
+    addi.d          a1, a1, 16
+
+    vseq.b          vr3, vr0, vr1
+    vmin.bu         vr3, vr0, vr3
+    vsetanyeqz.b    fcc0, vr3
+    bceqz           fcc0, L(al_loop)
+
+L(al_out):
+    vseqi.b         vr3, vr3, 0
+    vfrstpi.b       vr3, vr3, 0
+    vshuf.b         vr0, vr0, vr0, vr3
+    vshuf.b         vr1, vr1, vr1, vr3
+
+
+    vpickve2gr.bu   t0, vr0, 0
+    vpickve2gr.bu   t1, vr1, 0
+    sub.d           a0, t0, t1
+    jr              ra
+
+L(unaligned):
+    slt             a4, a3, a2
+    xor             t0, a0, a1
+    maskeqz         t0, t0, a4
+    xor             a0, a0, t0
+
+    xor             a1, a1, t0
+    andi            a2, a0, 0xf
+    andi            a3, a1, 0xf
+    bstrins.d       a0, zero, 3, 0
+
+    bstrins.d       a1, zero, 3, 0
+    vld             vr3, a0, 0
+    vld             vr1, a1, 0
+    vreplgr2vr.b    vr4, a2
+
+
+    vreplgr2vr.b    vr5, a3
+    vslt.b          vr7, vr2, vr5
+    vsub.b          vr5, vr5, vr4
+    vaddi.bu        vr6, vr2, 16
+
+    vsub.b          vr6, vr6, vr5
+    vshuf.b         vr0, vr3, vr3, vr6
+    vor.v           vr0, vr0, vr7
+    vor.v           vr1, vr1, vr7
+
+    vseq.b          vr5, vr0, vr1
+    vsetanyeqz.b    fcc0, vr5
+    bcnez           fcc0, L(not_equal)
+    vslt.b          vr4, vr2, vr4
+
+    vor.v           vr0, vr3, vr4
+    vsetanyeqz.b    fcc0, vr0
+    bcnez           fcc0, L(find_zero)
+    nop
+
+L(un_loop):
+    vld             vr3, a0, 16
+    vld             vr1, a1, 16
+    addi.d          a0, a0, 16
+    addi.d          a1, a1, 16
+
+    vshuf.b         vr0, vr3, vr0, vr6
+    vseq.b          vr5, vr0, vr1
+    vsetanyeqz.b    fcc0, vr5
+    bcnez           fcc0, L(not_equal)
+
+    vsetanyeqz.b    fcc0, vr3
+    vor.v           vr0, vr3, vr3
+    bceqz           fcc0, L(un_loop)
+L(find_zero):
+    vmin.bu         vr5, vr1, vr5
+
+    vsetanyeqz.b    fcc0, vr5
+    bcnez           fcc0, L(ret0)
+    vld             vr1, a1, 16
+    vshuf.b         vr0, vr3, vr3, vr6
+
+
+    vseq.b          vr5, vr0, vr1
+L(not_equal):
+    vmin.bu         vr5, vr0, vr5
+L(un_end):
+    vseqi.b         vr5, vr5, 0
+    vfrstpi.b       vr5, vr5, 0
+
+    vshuf.b         vr0, vr0, vr0, vr5
+    vshuf.b         vr1, vr1, vr1, vr5
+    vpickve2gr.bu   t0, vr0, 0
+    vpickve2gr.bu   t1, vr1, 0
+
+    sub.d           t3, t0, t1
+    sub.d           t4, t1, t0
+    masknez         t0, t3, a4
+    maskeqz         t1, t4, a4
+
+    or              a0, t0, t1
+    jr              ra
+L(ret0):
+    move            a0, zero
+    jr              ra
+END(STRCMP)
+
+libc_hidden_builtin_def (STRCMP)
+#endif
diff --git a/sysdeps/loongarch/lp64/multiarch/strcmp.c b/sysdeps/loongarch/lp64/multiarch/strcmp.c
new file mode 100644
index 0000000000..6f249c0b59
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strcmp.c
@@ -0,0 +1,35 @@
+/* Multiple versions of strcmp.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+# define strcmp __redirect_strcmp
+# include <string.h>
+# undef strcmp
+
+# define SYMBOL_NAME strcmp
+# include "ifunc-strcmp.h"
+
+libc_ifunc_redirected (__redirect_strcmp, strcmp, IFUNC_SELECTOR ());
+
+# ifdef SHARED
+__hidden_ver1 (strcmp, __GI_strcmp, __redirect_strcmp)
+  __attribute__ ((visibility ("hidden"))) __attribute_copy__ (strcmp);
+# endif
+#endif
-- 
2.40.0


^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH 3/3] Loongarch: Add ifunc support for strncmp{aligned, lsx}
  2023-08-22  2:11 [PATCH 0/3] Add ifunc support for str{nlen, cmp, ncmp} dengjianbo
  2023-08-22  2:11 ` [PATCH 1/3] Loongarch: Add ifunc support for strnlen{aligned, lsx, lasx} dengjianbo
  2023-08-22  2:11 ` [PATCH 2/3] Loongarch: Add ifunc support for strcmp{aligned, lsx} dengjianbo
@ 2023-08-22  2:11 ` dengjianbo
  2023-08-22  3:56   ` Richard Henderson
  2 siblings, 1 reply; 9+ messages in thread
From: dengjianbo @ 2023-08-22  2:11 UTC (permalink / raw)
  To: libc-alpha
  Cc: adhemerval.zanella, xry111, caiyinyu, xuchenghua, huangpei, dengjianbo

Based on the glibc microbenchmark, only a few short inputs with this
strncmp-aligned and strncmp-lsx implementation experience performance
degradation, overall, strncmp-aligned could reduce the runtime 0%-10%
for aligned comparison, 10%-25% for unaligned comparison, strncmp-lsx
could reduce the runtime about 0%-60%.
---
 sysdeps/loongarch/lp64/multiarch/Makefile     |   2 +
 .../lp64/multiarch/ifunc-impl-list.c          |   7 +
 .../loongarch/lp64/multiarch/ifunc-strncmp.h  |  38 +++
 .../lp64/multiarch/strncmp-aligned.S          | 218 ++++++++++++++++++
 .../loongarch/lp64/multiarch/strncmp-lsx.S    | 206 +++++++++++++++++
 sysdeps/loongarch/lp64/multiarch/strncmp.c    |  35 +++
 6 files changed, 506 insertions(+)
 create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-strncmp.h
 create mode 100644 sysdeps/loongarch/lp64/multiarch/strncmp-aligned.S
 create mode 100644 sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S
 create mode 100644 sysdeps/loongarch/lp64/multiarch/strncmp.c

diff --git a/sysdeps/loongarch/lp64/multiarch/Makefile b/sysdeps/loongarch/lp64/multiarch/Makefile
index d5a500decd..5d7ae7ae73 100644
--- a/sysdeps/loongarch/lp64/multiarch/Makefile
+++ b/sysdeps/loongarch/lp64/multiarch/Makefile
@@ -14,6 +14,8 @@ sysdep_routines += \
   strchrnul-lasx \
   strcmp-aligned \
   strcmp-lsx \
+  strncmp-aligned \
+  strncmp-lsx \
   memcpy-aligned \
   memcpy-unaligned \
   memmove-unaligned \
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
index 9183b7da24..c8ba87bd81 100644
--- a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
@@ -69,6 +69,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_aligned)
 	      )
 
+  IFUNC_IMPL (i, name, strncmp,
+#if !defined __loongarch_soft_float
+	      IFUNC_IMPL_ADD (array, i, strncmp, SUPPORT_LSX, __strncmp_lsx)
+#endif
+	      IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_aligned)
+	      )
+
   IFUNC_IMPL (i, name, memcpy,
 #if !defined __loongarch_soft_float
               IFUNC_IMPL_ADD (array, i, memcpy, SUPPORT_LASX, __memcpy_lasx)
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-strncmp.h b/sysdeps/loongarch/lp64/multiarch/ifunc-strncmp.h
new file mode 100644
index 0000000000..1a7dc36ba6
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-strncmp.h
@@ -0,0 +1,38 @@
+/* Common definition for strncmp ifunc selection.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <ldsodefs.h>
+#include <ifunc-init.h>
+
+#if !defined __loongarch_soft_float
+extern __typeof (REDIRECT_NAME) OPTIMIZE (lsx) attribute_hidden;
+#endif
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (aligned) attribute_hidden;
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+#if !defined __loongarch_soft_float
+  if (SUPPORT_LSX)
+    return OPTIMIZE (lsx);
+  else
+#endif
+    return OPTIMIZE (aligned);
+}
diff --git a/sysdeps/loongarch/lp64/multiarch/strncmp-aligned.S b/sysdeps/loongarch/lp64/multiarch/strncmp-aligned.S
new file mode 100644
index 0000000000..e2687fa770
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strncmp-aligned.S
@@ -0,0 +1,218 @@
+/* Optimized strncmp implementation using basic Loongarch instructions.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc)
+# define STRNCMP __strncmp_aligned
+#else
+# define STRNCMP strncmp
+#endif
+
+LEAF(STRNCMP, 6)
+    beqz        a2, L(ret0)
+    lu12i.w     a5, 0x01010
+    andi        a3, a0, 0x7
+    ori         a5, a5, 0x101
+
+    andi        a4, a1, 0x7
+    bstrins.d   a5, a5, 63, 32
+    li.d        t7, -1
+    li.d        t8, 8
+
+    addi.d      a2, a2, -1
+    slli.d      a6, a5, 7
+    bne         a3, a4, L(unaligned)
+    bstrins.d   a0, zero, 2, 0
+
+    bstrins.d   a1, zero, 2, 0
+    ld.d        t0, a0, 0
+    ld.d        t1, a1, 0
+    slli.d      t2, a3, 3
+
+
+    sub.d       t5, t8, a3
+    srl.d       t3, t7, t2
+    srl.d       t0, t0, t2
+    srl.d       t1, t1, t2
+
+    orn         t0, t0, t3
+    orn         t1, t1, t3
+    sub.d       t2, t0, a5
+    andn        t3, a6, t0
+
+    and         t2, t2, t3
+    bne         t0, t1, L(al_end)
+    sltu        t4, a2, t5
+    sub.d       a2, a2, t5
+
+L(al_loop):
+    or          t4, t2, t4
+    bnez        t4, L(ret0)
+    ldx.d       t0, a0, t8
+    ldx.d       t1, a1, t8
+
+
+    addi.d      t8, t8, 8
+    sltui       t4, a2, 8
+    addi.d      a2, a2, -8
+    sub.d       t2, t0, a5
+
+    andn        t3, a6, t0
+    and         t2, t2, t3
+    beq         t0, t1, L(al_loop)
+    addi.d      a2, a2, 8
+
+L(al_end):
+    xor         t3, t0, t1
+    or          t2, t2, t3
+    ctz.d       t2, t2
+    srli.d      t4, t2, 3
+
+    bstrins.d   t2, zero, 2, 0
+    srl.d       t0, t0, t2
+    srl.d       t1, t1, t2
+    andi        t0, t0, 0xff
+
+
+    andi        t1, t1, 0xff
+    sltu        t2, a2, t4
+    sub.d       a0, t0, t1
+    masknez     a0, a0, t2
+
+    jr          ra
+L(ret0):
+    move        a0, zero
+    jr          ra
+    nop
+
+L(unaligned):
+    slt         a7, a4, a3
+    xor         t0, a0, a1
+    maskeqz     t0, t0, a7
+    xor         a0, a0, t0
+
+    xor         a1, a1, t0
+    andi        a3, a0, 0x7
+    andi        a4, a1, 0x7
+    bstrins.d   a0, zero, 2, 0
+
+
+    bstrins.d   a1, zero, 2, 0
+    ld.d        t4, a0, 0
+    ld.d        t1, a1, 0
+    slli.d      t2, a3, 3
+
+    slli.d      t3, a4, 3
+    srl.d       t5, t7, t3
+    srl.d       t0, t4, t2
+    srl.d       t1, t1, t3
+
+    orn         t0, t0, t5
+    orn         t1, t1, t5
+    bne         t0, t1, L(not_equal)
+    sub.d       t6, t8, a4
+
+    sub.d       a4, t2, t3
+    sll.d       t2, t7, t2
+    sub.d       t5, t8, a3
+    orn         t4, t4, t2
+
+
+    sub.d       t2, t4, a5
+    andn        t3, a6, t4
+    sltu        t7, a2, t5
+    and         t2, t2, t3
+
+    sub.d       a3, zero, a4
+    or          t2, t2, t7
+    bnez        t2, L(un_end)
+    sub.d       t7, t5, t6
+
+    sub.d       a2, a2, t5
+    sub.d       t6, t8, t7
+L(un_loop):
+    srl.d       t5, t4, a4
+    ldx.d       t4, a0, t8
+
+    ldx.d       t1, a1, t8
+    addi.d      t8, t8, 8
+    sll.d       t0, t4, a3
+    or          t0, t0, t5
+
+
+    bne         t0, t1, L(loop_not_equal)
+    sub.d       t2, t4, a5
+    andn        t3, a6, t4
+    sltui       t5, a2, 8
+
+    and         t2, t2, t3
+    addi.d      a2, a2, -8
+    or          t3, t2, t5
+    beqz        t3, L(un_loop)
+
+    addi.d      a2, a2, 8
+L(un_end):
+    sub.d       t2, t0, a5
+    andn        t3, a6, t0
+    sltu        t5, a2, t6
+
+    and         t2, t2, t3
+    or          t2, t2, t5
+    bnez        t2, L(ret0)
+    ldx.d       t1, a1, t8
+
+
+    srl.d       t0, t4, a4
+    sub.d       a2, a2, t6
+L(not_equal):
+    sub.d       t2, t0, a5
+    andn        t3, a6, t0
+
+    xor         t4, t0, t1
+    and         t2, t2, t3
+    or          t2, t2, t4
+    ctz.d       t2, t2
+
+    bstrins.d   t2, zero, 2, 0
+    srli.d      t4, t2, 3
+    srl.d       t0, t0, t2
+    srl.d       t1, t1, t2
+
+    andi        t0, t0, 0xff
+    andi        t1, t1, 0xff
+    sub.d       t2, t0, t1
+    sub.d       t3, t1, t0
+
+
+    masknez     t0, t2, a7
+    maskeqz     t1, t3, a7
+    sltu        t2, a2, t4
+    or          a0, t0, t1
+
+    masknez     a0, a0, t2
+    jr          ra
+L(loop_not_equal):
+    add.d       a2, a2, t7
+    b           L(not_equal)
+END(STRNCMP)
+
+libc_hidden_builtin_def (STRNCMP)
diff --git a/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S b/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S
new file mode 100644
index 0000000000..595472fcda
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S
@@ -0,0 +1,206 @@
+/* Optimized strncmp implementation using Loongarch LSX instructions.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc) && !defined __loongarch_soft_float
+
+# define STRNCMP __strncmp_lsx
+
+L(magic_num):
+    .align          6
+    .dword          0x0706050403020100
+    .dword          0x0f0e0d0c0b0a0908
+ENTRY_NO_ALIGN(STRNCMP)
+    beqz            a2, L(ret0)
+    pcaddi          t0, -5
+    andi            a3, a0, 0xf
+    vld             vr2, t0, 0
+
+    andi            a4, a1, 0xf
+    li.d            t2, 16
+    bne             a3, a4, L(unaligned)
+    xor             t0, a0, a3
+
+    xor             t1, a1, a4
+    vld             vr0, t0, 0
+    vld             vr1, t1, 0
+    vreplgr2vr.b    vr3, a3
+
+
+    sub.d           t2, t2, a3
+    vadd.b          vr3, vr3, vr2
+    vshuf.b         vr0, vr3, vr0, vr3
+    vshuf.b         vr1, vr3, vr1, vr3
+
+    vseq.b          vr3, vr0, vr1
+    vmin.bu         vr3, vr0, vr3
+    bgeu            t2, a2, L(al_early_end)
+    vsetanyeqz.b    fcc0, vr3
+
+    bcnez           fcc0, L(al_end)
+    add.d           a3, a0, a2
+    addi.d          a4, a3, -1
+    bstrins.d       a4, zero, 3, 0
+
+    sub.d           a2, a3, a4
+L(al_loop):
+    vld             vr0, t0, 16
+    vld             vr1, t1, 16
+    addi.d          t0, t0, 16
+
+
+    addi.d          t1, t1, 16
+    vseq.b          vr3, vr0, vr1
+    vmin.bu         vr3, vr0, vr3
+    beq             t0, a4, L(al_early_end)
+
+    vsetanyeqz.b    fcc0, vr3
+    bceqz           fcc0, L(al_loop)
+L(al_end):
+    vseqi.b         vr3, vr3, 0
+    vfrstpi.b       vr3, vr3, 0
+
+    vshuf.b         vr0, vr0, vr0, vr3
+    vshuf.b         vr1, vr1, vr1, vr3
+    vpickve2gr.bu   t0, vr0, 0
+    vpickve2gr.bu   t1, vr1, 0
+
+    sub.d           a0, t0, t1
+    jr              ra
+L(al_early_end):
+    vreplgr2vr.b    vr4, a2
+    vslt.b          vr4, vr2, vr4
+
+
+    vorn.v          vr3, vr3, vr4
+    b               L(al_end)
+L(unaligned):
+    slt             a5, a3, a4
+    xor             t0, a0, a1
+
+    maskeqz         t0, t0, a5
+    xor             a0, a0, t0
+    xor             a1, a1, t0
+    andi            a3, a0, 0xf
+
+    andi            a4, a1, 0xf
+    xor             t0, a0, a3
+    xor             t1, a1, a4
+    vld             vr0, t0, 0
+
+    vld             vr3, t1, 0
+    sub.d           t2, t2, a3
+    vreplgr2vr.b    vr4, a3
+    vreplgr2vr.b    vr5, a4
+
+
+    vaddi.bu        vr6, vr2, 16
+    vsub.b          vr7, vr4, vr5
+    vsub.b          vr6, vr6, vr7
+    vadd.b          vr4, vr2, vr4
+
+    vshuf.b         vr1, vr3, vr3, vr6
+    vshuf.b         vr0, vr7, vr0, vr4
+    vshuf.b         vr1, vr7, vr1, vr4
+    vseq.b          vr4, vr0, vr1
+
+    vmin.bu         vr4, vr0, vr4
+    bgeu            t2, a2, L(un_early_end)
+    vsetanyeqz.b    fcc0, vr4
+    bcnez           fcc0, L(un_end)
+
+    add.d           a6, a0, a2
+    vslt.b          vr5, vr2, vr5
+    addi.d          a7, a6, -1
+    vor.v           vr3, vr3, vr5
+
+
+    bstrins.d       a7, zero, 3, 0
+    sub.d           a2, a6, a7
+L(un_loop):
+    vld             vr0, t0, 16
+    addi.d          t0, t0, 16
+
+    vsetanyeqz.b    fcc0, vr3
+    bcnez           fcc0, L(has_zero)
+    beq             t0, a7, L(end_with_len)
+    vor.v           vr1, vr3, vr3
+
+    vld             vr3, t1, 16
+    addi.d          t1, t1, 16
+    vshuf.b         vr1, vr3, vr1, vr6
+    vseq.b          vr4, vr0, vr1
+
+    vmin.bu         vr4, vr0, vr4
+    vsetanyeqz.b    fcc0, vr4
+    bceqz           fcc0, L(un_loop)
+L(un_end):
+    vseqi.b         vr4, vr4, 0
+
+
+    vfrstpi.b       vr4, vr4, 0
+    vshuf.b         vr0, vr0, vr0, vr4
+    vshuf.b         vr1, vr1, vr1, vr4
+    vpickve2gr.bu   t0, vr0, 0
+
+    vpickve2gr.bu   t1, vr1, 0
+    sub.d           t2, t0, t1
+    sub.d           t3, t1, t0
+    masknez         t0, t2, a5
+
+    maskeqz         t1, t3, a5
+    or              a0, t0, t1
+    jr              ra
+L(has_zero):
+    vshuf.b         vr1, vr3, vr3, vr6
+
+    vseq.b          vr4, vr0, vr1
+    vmin.bu         vr4, vr0, vr4
+    bne             t0, a7, L(un_end)
+L(un_early_end):
+    vreplgr2vr.b    vr5, a2
+
+    vslt.b          vr5, vr2, vr5
+    vorn.v          vr4, vr4, vr5
+    b               L(un_end)
+L(end_with_len):
+    sub.d           a6, a3, a4
+
+    bgeu            a6, a2, 1f
+    vld             vr4, t1, 16
+1:
+    vshuf.b         vr1, vr4, vr3, vr6
+    vseq.b          vr4, vr0, vr1
+
+    vmin.bu         vr4, vr0, vr4
+    vreplgr2vr.b    vr5, a2
+    vslt.b          vr5, vr2, vr5
+    vorn.v          vr4, vr4, vr5
+
+    b               L(un_end)
+L(ret0):
+    move            a0, zero
+    jr              ra
+END(STRNCMP)
+
+libc_hidden_builtin_def (STRNCMP)
+#endif
diff --git a/sysdeps/loongarch/lp64/multiarch/strncmp.c b/sysdeps/loongarch/lp64/multiarch/strncmp.c
new file mode 100644
index 0000000000..af6d0bc4a7
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strncmp.c
@@ -0,0 +1,35 @@
+/* Multiple versions of strncmp.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+# define strncmp __redirect_strncmp
+# include <string.h>
+# undef strncmp
+
+# define SYMBOL_NAME strncmp
+# include "ifunc-strncmp.h"
+
+libc_ifunc_redirected (__redirect_strncmp, strncmp, IFUNC_SELECTOR ());
+
+# ifdef SHARED
+__hidden_ver1 (strncmp, __GI_strncmp, __redirect_strncmp)
+  __attribute__ ((visibility ("hidden"))) __attribute_copy__ (strncmp);
+# endif
+#endif
-- 
2.40.0


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 3/3] Loongarch: Add ifunc support for strncmp{aligned, lsx}
  2023-08-22  2:11 ` [PATCH 3/3] Loongarch: Add ifunc support for strncmp{aligned, lsx} dengjianbo
@ 2023-08-22  3:56   ` Richard Henderson
  2023-08-22  6:37     ` dengjianbo
  0 siblings, 1 reply; 9+ messages in thread
From: Richard Henderson @ 2023-08-22  3:56 UTC (permalink / raw)
  To: dengjianbo, libc-alpha
  Cc: adhemerval.zanella, xry111, caiyinyu, xuchenghua, huangpei

On 8/21/23 19:11, dengjianbo wrote:
> +L(magic_num):
> +    .align          6
> +    .dword          0x0706050403020100
> +    .dword          0x0f0e0d0c0b0a0908
> +ENTRY_NO_ALIGN(STRNCMP)
> +    beqz            a2, L(ret0)
> +    pcaddi          t0, -5
> +    andi            a3, a0, 0xf
> +    vld             vr2, t0, 0

Why is the data not in .rodata or a mergable constant section?

You can use pcalau12i and %pc_lo12 with vld to place this data anywhere.


r~

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 3/3] Loongarch: Add ifunc support for strncmp{aligned, lsx}
  2023-08-22  3:56   ` Richard Henderson
@ 2023-08-22  6:37     ` dengjianbo
  2023-08-22 11:13       ` Xi Ruoyao
  0 siblings, 1 reply; 9+ messages in thread
From: dengjianbo @ 2023-08-22  6:37 UTC (permalink / raw)
  To: Richard Henderson, libc-alpha
  Cc: adhemerval.zanella, xry111, caiyinyu, xuchenghua, huangpei

[-- Attachment #1: Type: text/plain, Size: 742 bytes --]


On 2023-08-22 11:56, Richard Henderson wrote:
>> +L(magic_num):
>> +    .align          6
>> +    .dword          0x0706050403020100
>> +    .dword          0x0f0e0d0c0b0a0908
>> +ENTRY_NO_ALIGN(STRNCMP)
>> +    beqz            a2, L(ret0)
>> +    pcaddi          t0, -5
>> +    andi            a3, a0, 0xf
>> +    vld             vr2, t0, 0
>
> Why is the data not in .rodata or a mergable constant section?
>
> You can use pcalau12i and %pc_lo12 with vld to place this data anywhere.
>
>
> r~ 

Putting the data here is for performance: when the vld
instruction is executed, the data will already be in the
cache, which can speed up the data loading.


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 3/3] Loongarch: Add ifunc support for strncmp{aligned, lsx}
  2023-08-22  6:37     ` dengjianbo
@ 2023-08-22 11:13       ` Xi Ruoyao
  2023-08-22 11:23         ` Xi Ruoyao
  0 siblings, 1 reply; 9+ messages in thread
From: Xi Ruoyao @ 2023-08-22 11:13 UTC (permalink / raw)
  To: dengjianbo, Richard Henderson, libc-alpha
  Cc: adhemerval.zanella, caiyinyu, xuchenghua, huangpei

On Tue, 2023-08-22 at 14:37 +0800, dengjianbo wrote:

> Putting the data here is due to the performance. When the vld
>  instruction is executed, the data will be in the cache, it can
>  speed up the data loading. 

AFAIK LoongArch CPUs have separate icache and dcache like all modern
CPUs, so this is not valid to me.

-- 
Xi Ruoyao <xry111@xry111.site>
School of Aerospace Science and Technology, Xidian University

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 3/3] Loongarch: Add ifunc support for strncmp{aligned, lsx}
  2023-08-22 11:13       ` Xi Ruoyao
@ 2023-08-22 11:23         ` Xi Ruoyao
  2023-08-23  7:25           ` dengjianbo
  0 siblings, 1 reply; 9+ messages in thread
From: Xi Ruoyao @ 2023-08-22 11:23 UTC (permalink / raw)
  To: dengjianbo, Richard Henderson, libc-alpha
  Cc: adhemerval.zanella, caiyinyu, xuchenghua, huangpei

On Tue, 2023-08-22 at 19:13 +0800, Xi Ruoyao via Libc-alpha wrote:
> On Tue, 2023-08-22 at 14:37 +0800, dengjianbo wrote:
> 
> > Putting the data here is due to the performance. When the vld
> >  instruction is executed, the data will be in the cache, it can
> >  speed up the data loading. 
> 
> AFAIK LoongArch CPUs have separate icache and dcache like all modern
> CPUs, so this is not valid to me.

And even if it can really improve the performance, this is not on the
hot path of the algorithm so we should not use bizarre optimizations
here for marginal improvement.

-- 
Xi Ruoyao <xry111@xry111.site>
School of Aerospace Science and Technology, Xidian University

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 3/3] Loongarch: Add ifunc support for strncmp{aligned, lsx}
  2023-08-22 11:23         ` Xi Ruoyao
@ 2023-08-23  7:25           ` dengjianbo
  0 siblings, 0 replies; 9+ messages in thread
From: dengjianbo @ 2023-08-23  7:25 UTC (permalink / raw)
  To: Xi Ruoyao, Richard Henderson, libc-alpha
  Cc: adhemerval.zanella, caiyinyu, xuchenghua, huangpei

[-- Attachment #1: Type: text/plain, Size: 2170 bytes --]


On 2023-08-22 19:23, Xi Ruoyao wrote:
> On Tue, 2023-08-22 at 19:13 +0800, Xi Ruoyao via Libc-alpha wrote:
>> On Tue, 2023-08-22 at 14:37 +0800, dengjianbo wrote:
>>
>>> Putting the data here is due to the performance. When the vld
>>>  instruction is executed, the data will be in the cache, it can
>>>  speed up the data loading. 
>> AFAIK LoongArch CPUs have separate icache and dcache like all modern
>> CPUs, so this is not valid to me.
> And even if it can really improve the performance, this is not on the
> hot path of the algorithm so we should not use bizarre optimizations
> here for marginal improvement.
>
> -- Xi Ruoyao <xry111@xry111.site> School of Aerospace Science and Technology, Xidian University
Thanks for your suggestion. We have changed strcmp and strncmp to put
the data in the rodata section with mergeable flags, and also use pcalau12i
and %pc_lo12 with the vld to get the data.

diff --git a/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S b/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S

index 595472fcda..0b4eee2a98 100644
--- a/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S
+++ b/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S
@@ -25,15 +25,11 @@
 
 # define STRNCMP __strncmp_lsx
 
-L(magic_num):
-    .align          6
-    .dword          0x0706050403020100
-    .dword          0x0f0e0d0c0b0a0908
-ENTRY_NO_ALIGN(STRNCMP)
+LEAF(STRNCMP, 6)
     beqz            a2, L(ret0)
-    pcaddi          t0, -5
+    pcalau12i       t0, %pc_hi20(L(INDEX))
     andi            a3, a0, 0xf
-    vld             vr2, t0, 0
+    vld             vr2, t0, %pc_lo12(L(INDEX))
 
     andi            a4, a1, 0xf
     li.d            t2, 16
@@ -202,5 +198,11 @@ L(ret0):
     jr              ra
 END(STRNCMP)
 
+    .section         .rodata.cst16,"M",@progbits,16
+    .align           4
+L(INDEX):
+    .dword           0x0706050403020100
+    .dword           0x0f0e0d0c0b0a0908
+
 libc_hidden_builtin_def (STRNCMP)
 #endif


^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2023-08-23  7:25 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-08-22  2:11 [PATCH 0/3] Add ifunc support for str{nlen, cmp, ncmp} dengjianbo
2023-08-22  2:11 ` [PATCH 1/3] Loongarch: Add ifunc support for strnlen{aligned, lsx, lasx} dengjianbo
2023-08-22  2:11 ` [PATCH 2/3] Loongarch: Add ifunc support for strcmp{aligned, lsx} dengjianbo
2023-08-22  2:11 ` [PATCH 3/3] Loongarch: Add ifunc support for strncmp{aligned, lsx} dengjianbo
2023-08-22  3:56   ` Richard Henderson
2023-08-22  6:37     ` dengjianbo
2023-08-22 11:13       ` Xi Ruoyao
2023-08-22 11:23         ` Xi Ruoyao
2023-08-23  7:25           ` dengjianbo

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).