public inbox for libc-alpha@sourceware.org
* [PATCH v2 0/3] LoongArch: Add ifunc support for str{cp, rchr},
From: dengjianbo @ 2023-09-13  7:34 UTC
  To: libc-alpha
  Cc: adhemerval.zanella, xry111, caiyinyu, xuchenghua, huangpei, dengjianbo

This patch adds multiple versions of strcpy, stpcpy and strrchr,
implemented with basic LoongArch instructions, LSX instructions and
LASX instructions.  Even though these implementations see some
degradation in a few cases, overall the performance gains are
significant.
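
(Background, not part of the patches: with ifunc, each routine gets a
resolver that the dynamic linker runs once, at relocation time, to pick
the best implementation for the running CPU.  A plain-C sketch of the
dispatch; the cpu_supports_* probes are hypothetical stand-ins for the
SUPPORT_LSX/SUPPORT_LASX macros used by the real selectors, e.g.
ifunc-strrchr.h in patch 2:)

    /* Hypothetical CPU-feature probes; the real selectors test the
       SUPPORT_LSX / SUPPORT_LASX macros instead.  */
    extern int cpu_supports_lasx (void);
    extern int cpu_supports_lsx (void);

    char *__strrchr_lasx (const char *, int);
    char *__strrchr_lsx (const char *, int);
    char *__strrchr_aligned (const char *, int);

    /* Resolver: called once by the dynamic linker; strrchr is then
       bound to whichever implementation it returns.  */
    static void *
    strrchr_resolver (void)
    {
      if (cpu_supports_lasx ())
        return (void *) __strrchr_lasx;
      if (cpu_supports_lsx ())
        return (void *) __strrchr_lsx;
      return (void *) __strrchr_aligned;
    }

    char *strrchr (const char *s, int c)
      __attribute__ ((ifunc ("strrchr_resolver")));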

See:
https://github.com/jiadengx/glibc_test/blob/main/bench/strcpy_compare_v2.out
https://github.com/jiadengx/glibc_test/blob/main/bench/stpcpy_compare_v2.out

In the benchmark, test results are compared against the generic strcpy
and stpcpy, not against strlen + memcpy.

Generic strrchr is implemented as strlen + memrchr.  strrchr-lasx is
compared with a generic_strrchr built from strlen-lasx and memrchr-lasx,
strrchr-lsx with a generic_strrchr built from strlen-lsx and
memrchr-lsx, and strrchr-aligned with a generic_strrchr built from
strlen-aligned and memrchr-generic.  A sketch of this baseline follows
the links below.
https://github.com/jiadengx/glibc_test/blob/main/bench/strrchr_lasx_compare.out
https://github.com/jiadengx/glibc_test/blob/main/bench/strrchr_lsx_compare.out
https://github.com/jiadengx/glibc_test/blob/main/bench/strrchr_aligned_compare.out
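
A minimal sketch of that baseline, assuming only what is stated above
(generic strrchr composed from strlen + memrchr; generic_strrchr here
is an illustrative name, not the benchmark's actual symbol):

    #define _GNU_SOURCE   /* memrchr is a GNU extension */
    #include <string.h>

    /* Search backwards over the whole string, including the NUL
       terminator, so that searching for '\0' also works.  */
    static char *
    generic_strrchr (const char *s, int c)
    {
      return (char *) memrchr (s, (unsigned char) c, strlen (s) + 1);
    }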

In the data, positive values in the parentheses indicate that our
implementation took less time (a performance improvement); negative
values in the parentheses mean that our implementation took more time
(a performance decrease).  The following summarises the performance
relative to the generic version in the glibc microbenchmark:

Name              Percent of runtime reduced
strcpy-aligned    8%-45%
strcpy-unaligned  8%-48%; compared with the aligned version, the
                  unaligned version performs better when src and dest
                  cannot both be aligned to 8 bytes
strcpy-lsx        20%-80%
strcpy-lasx       15%-86%

stpcpy-lasx       10%-87%
stpcpy-lsx        10%-80%
stpcpy-aligned    6%-43%
stpcpy-unaligned  8%-48%

strrchr-lasx      10%-50%
strrchr-lsx       0%-50%
strrchr-aligned   5%-50%

dengjianbo (3):
  LoongArch: Add ifunc support for strcpy, stpcpy{aligned, unaligned,
    lsx, lasx}
  LoongArch: Add ifunc support for strrchr{aligned, lsx, lasx}
  LoongArch: Change to put magic number to .rodata section

 sysdeps/loongarch/lp64/multiarch/Makefile     |  11 +
 .../lp64/multiarch/ifunc-impl-list.c          |  26 +++
 .../loongarch/lp64/multiarch/ifunc-strrchr.h  |  41 ++++
 .../loongarch/lp64/multiarch/memmove-lsx.S    |  20 +-
 .../loongarch/lp64/multiarch/stpcpy-aligned.S |  27 +++
 .../loongarch/lp64/multiarch/stpcpy-lasx.S    |  22 ++
 sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S |  22 ++
 .../lp64/multiarch/stpcpy-unaligned.S         |  22 ++
 sysdeps/loongarch/lp64/multiarch/stpcpy.c     |  42 ++++
 .../loongarch/lp64/multiarch/strcpy-aligned.S | 202 ++++++++++++++++
 .../loongarch/lp64/multiarch/strcpy-lasx.S    | 215 ++++++++++++++++++
 sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S | 212 +++++++++++++++++
 .../lp64/multiarch/strcpy-unaligned.S         | 138 +++++++++++
 sysdeps/loongarch/lp64/multiarch/strcpy.c     |  35 +++
 .../lp64/multiarch/strrchr-aligned.S          | 170 ++++++++++++++
 .../loongarch/lp64/multiarch/strrchr-lasx.S   | 176 ++++++++++++++
 .../loongarch/lp64/multiarch/strrchr-lsx.S    | 144 ++++++++++++
 sysdeps/loongarch/lp64/multiarch/strrchr.c    |  36 +++
 18 files changed, 1551 insertions(+), 10 deletions(-)
 create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-strrchr.h
 create mode 100644 sysdeps/loongarch/lp64/multiarch/stpcpy-aligned.S
 create mode 100644 sysdeps/loongarch/lp64/multiarch/stpcpy-lasx.S
 create mode 100644 sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S
 create mode 100644 sysdeps/loongarch/lp64/multiarch/stpcpy-unaligned.S
 create mode 100644 sysdeps/loongarch/lp64/multiarch/stpcpy.c
 create mode 100644 sysdeps/loongarch/lp64/multiarch/strcpy-aligned.S
 create mode 100644 sysdeps/loongarch/lp64/multiarch/strcpy-lasx.S
 create mode 100644 sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S
 create mode 100644 sysdeps/loongarch/lp64/multiarch/strcpy-unaligned.S
 create mode 100644 sysdeps/loongarch/lp64/multiarch/strcpy.c
 create mode 100644 sysdeps/loongarch/lp64/multiarch/strrchr-aligned.S
 create mode 100644 sysdeps/loongarch/lp64/multiarch/strrchr-lasx.S
 create mode 100644 sysdeps/loongarch/lp64/multiarch/strrchr-lsx.S
 create mode 100644 sysdeps/loongarch/lp64/multiarch/strrchr.c

-- 
2.40.0



* [PATCH v2 1/3] LoongArch: Add ifunc support for strcpy, stpcpy{aligned, unaligned, lsx, lasx}
From: dengjianbo @ 2023-09-13  7:34 UTC
  To: libc-alpha
  Cc: adhemerval.zanella, xry111, caiyinyu, xuchenghua, huangpei, dengjianbo

According to the glibc strcpy and stpcpy microbenchmark results
(changed to use generic_strcpy and generic_stpcpy instead of strlen +
memcpy), compared with the generic version this implementation reduces
the runtime as follows:

Name              Percent of runtime reduced
strcpy-aligned    8%-45%
strcpy-unaligned  8%-48%; compared with the aligned version, the
                  unaligned version takes fewer instructions to copy a
                  tail shorter than 8 bytes, and it also performs better
                  when src and dest cannot both be aligned to 8 bytes
strcpy-lsx        20%-80%
strcpy-lasx       15%-86%
stpcpy-aligned    6%-43%
stpcpy-unaligned  8%-48%
stpcpy-lsx        10%-80%
stpcpy-lasx       10%-87%
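
For reference, the aligned and unaligned variants detect the
terminating NUL a word at a time with the classic zero-byte test; the
constants below correspond to t5 and t6 in the assembly.  A minimal C
sketch:

    #include <stdint.h>

    /* Nonzero iff some byte of WORD is zero: the subtraction borrows
       exactly where a byte was 0x00, and the final mask keeps only the
       borrows of bytes whose high bit was not already set.  */
    static inline uint64_t
    has_zero_byte (uint64_t word)
    {
      uint64_t ones = 0x0101010101010101ULL;   /* t5 in the assembly */
      uint64_t highs = ones << 7;              /* t6: 0x8080...80 */
      return (word - ones) & ~word & highs;
    }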
---
 sysdeps/loongarch/lp64/multiarch/Makefile     |   8 +
 .../lp64/multiarch/ifunc-impl-list.c          |  18 ++
 .../loongarch/lp64/multiarch/stpcpy-aligned.S |  27 +++
 .../loongarch/lp64/multiarch/stpcpy-lasx.S    |  22 ++
 sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S |  22 ++
 .../lp64/multiarch/stpcpy-unaligned.S         |  22 ++
 sysdeps/loongarch/lp64/multiarch/stpcpy.c     |  42 ++++
 .../loongarch/lp64/multiarch/strcpy-aligned.S | 202 ++++++++++++++++
 .../loongarch/lp64/multiarch/strcpy-lasx.S    | 215 ++++++++++++++++++
 sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S | 212 +++++++++++++++++
 .../lp64/multiarch/strcpy-unaligned.S         | 138 +++++++++++
 sysdeps/loongarch/lp64/multiarch/strcpy.c     |  35 +++
 12 files changed, 963 insertions(+)
 create mode 100644 sysdeps/loongarch/lp64/multiarch/stpcpy-aligned.S
 create mode 100644 sysdeps/loongarch/lp64/multiarch/stpcpy-lasx.S
 create mode 100644 sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S
 create mode 100644 sysdeps/loongarch/lp64/multiarch/stpcpy-unaligned.S
 create mode 100644 sysdeps/loongarch/lp64/multiarch/stpcpy.c
 create mode 100644 sysdeps/loongarch/lp64/multiarch/strcpy-aligned.S
 create mode 100644 sysdeps/loongarch/lp64/multiarch/strcpy-lasx.S
 create mode 100644 sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S
 create mode 100644 sysdeps/loongarch/lp64/multiarch/strcpy-unaligned.S
 create mode 100644 sysdeps/loongarch/lp64/multiarch/strcpy.c

diff --git a/sysdeps/loongarch/lp64/multiarch/Makefile b/sysdeps/loongarch/lp64/multiarch/Makefile
index 360a6718c0..39550bea9b 100644
--- a/sysdeps/loongarch/lp64/multiarch/Makefile
+++ b/sysdeps/loongarch/lp64/multiarch/Makefile
@@ -16,6 +16,14 @@ sysdep_routines += \
   strcmp-lsx \
   strncmp-aligned \
   strncmp-lsx \
+  strcpy-aligned \
+  strcpy-unaligned \
+  strcpy-lsx \
+  strcpy-lasx \
+  stpcpy-aligned \
+  stpcpy-unaligned \
+  stpcpy-lsx \
+  stpcpy-lasx \
   memcpy-aligned \
   memcpy-unaligned \
   memmove-unaligned \
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
index e397d58c9d..39a14f1d6e 100644
--- a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
@@ -76,6 +76,24 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_aligned)
 	      )
 
+  IFUNC_IMPL (i, name, strcpy,
+#if !defined __loongarch_soft_float
+	      IFUNC_IMPL_ADD (array, i, strcpy, SUPPORT_LASX, __strcpy_lasx)
+	      IFUNC_IMPL_ADD (array, i, strcpy, SUPPORT_LSX, __strcpy_lsx)
+#endif
+	      IFUNC_IMPL_ADD (array, i, strcpy, SUPPORT_UAL, __strcpy_unaligned)
+	      IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_aligned)
+	      )
+
+  IFUNC_IMPL (i, name, stpcpy,
+#if !defined __loongarch_soft_float
+	      IFUNC_IMPL_ADD (array, i, stpcpy, SUPPORT_LASX, __stpcpy_lasx)
+	      IFUNC_IMPL_ADD (array, i, stpcpy, SUPPORT_LSX, __stpcpy_lsx)
+#endif
+	      IFUNC_IMPL_ADD (array, i, stpcpy, SUPPORT_UAL, __stpcpy_unaligned)
+	      IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_aligned)
+	      )
+
   IFUNC_IMPL (i, name, memcpy,
 #if !defined __loongarch_soft_float
               IFUNC_IMPL_ADD (array, i, memcpy, SUPPORT_LASX, __memcpy_lasx)
diff --git a/sysdeps/loongarch/lp64/multiarch/stpcpy-aligned.S b/sysdeps/loongarch/lp64/multiarch/stpcpy-aligned.S
new file mode 100644
index 0000000000..1f763db685
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/stpcpy-aligned.S
@@ -0,0 +1,27 @@
+/* stpcpy-aligned implementation is in strcpy-aligned.S.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+# define STPCPY __stpcpy_aligned
+#else
+# define STPCPY __stpcpy
+#endif
+
+#define USE_AS_STPCPY
+#define STRCPY STPCPY
+#include "strcpy-aligned.S"
diff --git a/sysdeps/loongarch/lp64/multiarch/stpcpy-lasx.S b/sysdeps/loongarch/lp64/multiarch/stpcpy-lasx.S
new file mode 100644
index 0000000000..13d6c9539b
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/stpcpy-lasx.S
@@ -0,0 +1,22 @@
+/* stpcpy-lasx implementation is in strcpy-lasx.S.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define STPCPY __stpcpy_lasx
+#define USE_AS_STPCPY
+#define STRCPY STPCPY
+#include "strcpy-lasx.S"
diff --git a/sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S b/sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S
new file mode 100644
index 0000000000..e0f17ab589
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S
@@ -0,0 +1,22 @@
+/* stpcpy-lsx implementation is in strcpy-lsx.S.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define STPCPY __stpcpy_lsx
+#define USE_AS_STPCPY
+#define STRCPY STPCPY
+#include "strcpy-lsx.S"
diff --git a/sysdeps/loongarch/lp64/multiarch/stpcpy-unaligned.S b/sysdeps/loongarch/lp64/multiarch/stpcpy-unaligned.S
new file mode 100644
index 0000000000..cc2f971214
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/stpcpy-unaligned.S
@@ -0,0 +1,22 @@
+/* stpcpy-unaligned implementation is in strcpy-unaligned.S.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define STPCPY __stpcpy_unaligned
+#define USE_AS_STPCPY
+#define STRCPY STPCPY
+#include "strcpy-unaligned.S"
diff --git a/sysdeps/loongarch/lp64/multiarch/stpcpy.c b/sysdeps/loongarch/lp64/multiarch/stpcpy.c
new file mode 100644
index 0000000000..d4860d7a47
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/stpcpy.c
@@ -0,0 +1,42 @@
+/* Multiple versions of stpcpy.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2017-2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+# define stpcpy __redirect_stpcpy
+# define __stpcpy __redirect___stpcpy
+# define NO_MEMPCPY_STPCPY_REDIRECT
+# define __NO_STRING_INLINES
+# include <string.h>
+# undef stpcpy
+# undef __stpcpy
+
+# define SYMBOL_NAME stpcpy
+# include "ifunc-lasx.h"
+
+libc_ifunc_redirected (__redirect_stpcpy, __stpcpy, IFUNC_SELECTOR ());
+
+weak_alias (__stpcpy, stpcpy)
+# ifdef SHARED
+__hidden_ver1 (__stpcpy, __GI___stpcpy, __redirect___stpcpy)
+  __attribute__ ((visibility ("hidden"))) __attribute_copy__ (stpcpy);
+__hidden_ver1 (stpcpy, __GI_stpcpy, __redirect_stpcpy)
+  __attribute__ ((visibility ("hidden"))) __attribute_copy__ (stpcpy);
+# endif
+#endif
diff --git a/sysdeps/loongarch/lp64/multiarch/strcpy-aligned.S b/sysdeps/loongarch/lp64/multiarch/strcpy-aligned.S
new file mode 100644
index 0000000000..4ed539fdb4
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strcpy-aligned.S
@@ -0,0 +1,202 @@
+/* Optimized strcpy stpcpy aligned implementation using basic LoongArch
+   instructions.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc)
+# ifndef STRCPY
+#  define STRCPY __strcpy_aligned
+# endif
+#else
+# ifndef STRCPY
+#  define STRCPY strcpy
+# endif
+#endif
+
+LEAF(STRCPY, 6)
+    andi        a3, a0, 0x7
+    move        a2, a0
+    beqz        a3, L(dest_align)
+    sub.d       a5, a1, a3
+    addi.d      a5, a5, 8
+
+L(make_dest_align):
+    ld.b        t0, a1, 0
+    addi.d      a1, a1, 1
+    st.b        t0, a2, 0
+    addi.d      a2, a2, 1
+    beqz        t0, L(al_out)
+
+    bne         a1, a5, L(make_dest_align)
+
+L(dest_align):
+    andi        a4, a1, 7
+    bstrins.d   a1, zero, 2, 0
+
+    lu12i.w     t5, 0x1010
+    ld.d        t0, a1, 0
+    ori         t5, t5, 0x101
+    bstrins.d   t5, t5, 63, 32
+
+    slli.d      t6, t5, 0x7
+    bnez        a4, L(unalign)
+    sub.d       t1, t0, t5
+    andn        t2, t6, t0
+
+    and         t3, t1, t2
+    bnez        t3, L(al_end)
+
+L(al_loop):
+    st.d        t0, a2, 0
+    ld.d        t0, a1, 8
+
+    addi.d      a1, a1, 8
+    addi.d      a2, a2, 8
+    sub.d       t1, t0, t5
+    andn        t2, t6, t0
+
+    and         t3, t1, t2
+    beqz        t3, L(al_loop)
+
+L(al_end):
+    ctz.d       t1, t3
+    srli.d      t1, t1, 3
+    addi.d      t1, t1, 1
+
+    andi        a3, t1, 8
+    andi        a4, t1, 4
+    andi        a5, t1, 2
+    andi        a6, t1, 1
+
+L(al_end_8):
+    beqz        a3, L(al_end_4)
+    st.d        t0, a2, 0
+#ifdef USE_AS_STPCPY
+    addi.d      a0, a2, 7
+#endif
+    jr          ra
+L(al_end_4):
+    beqz        a4, L(al_end_2)
+    st.w        t0, a2, 0
+    addi.d      a2, a2, 4
+    srli.d      t0, t0, 32
+L(al_end_2):
+    beqz        a5, L(al_end_1)
+    st.h        t0, a2, 0
+    addi.d      a2, a2, 2
+    srli.d      t0, t0, 16
+L(al_end_1):
+    beqz        a6, L(al_out)
+    st.b        t0, a2, 0
+    addi.d      a2, a2, 1
+L(al_out):
+#ifdef USE_AS_STPCPY
+    addi.d      a0, a2, -1
+#endif
+    jr          ra
+
+    .align      4
+L(unalign):
+    slli.d      a5, a4, 3
+    li.d        t1, -1
+    sub.d       a6, zero, a5
+
+    srl.d       a7, t0, a5
+    sll.d       t7, t1, a6
+
+    or          t0, a7, t7
+    sub.d       t1, t0, t5
+    andn        t2, t6, t0
+    and         t3, t1, t2
+
+    bnez        t3, L(un_end)
+
+    ld.d        t4, a1, 8
+
+    sub.d       t1, t4, t5
+    andn        t2, t6, t4
+    sll.d       t0, t4, a6
+    and         t3, t1, t2
+
+    or          t0, t0, a7
+    bnez        t3, L(un_end_with_remaining)
+
+L(un_loop):
+    srl.d       a7, t4, a5
+
+    ld.d        t4, a1, 16
+    addi.d      a1, a1, 8
+
+    st.d        t0, a2, 0
+    addi.d      a2, a2, 8
+
+    sub.d       t1, t4, t5
+    andn        t2, t6, t4
+    sll.d       t0, t4, a6
+    and         t3, t1, t2
+
+    or          t0, t0, a7
+    beqz        t3, L(un_loop)
+
+L(un_end_with_remaining):
+    ctz.d       t1, t3
+    srli.d      t1, t1, 3
+    addi.d      t1, t1, 1
+    sub.d       t1, t1, a4
+
+    blt         t1, zero, L(un_end_less_8)
+    st.d        t0, a2, 0
+    addi.d      a2, a2, 8
+    beqz        t1, L(un_out)
+    srl.d       t0, t4, a5
+    b           L(un_end_less_8)
+
+L(un_end):
+    ctz.d       t1, t3
+    srli.d      t1, t1, 3
+    addi.d      t1, t1, 1
+
+L(un_end_less_8):
+    andi        a4, t1, 4
+    andi        a5, t1, 2
+    andi        a6, t1, 1
+L(un_end_4):
+    beqz        a4, L(un_end_2)
+    st.w        t0, a2, 0
+    addi.d      a2, a2, 4
+    srli.d      t0, t0, 32
+L(un_end_2):
+    beqz        a5, L(un_end_1)
+    st.h        t0, a2, 0
+    addi.d      a2, a2, 2
+    srli.d      t0, t0, 16
+L(un_end_1):
+    beqz        a6, L(un_out)
+    st.b        t0, a2, 0
+    addi.d      a2, a2, 1
+L(un_out):
+#ifdef USE_AS_STPCPY
+    addi.d      a0, a2, -1
+#endif
+    jr          ra
+END(STRCPY)
+
+libc_hidden_builtin_def (STRCPY)
diff --git a/sysdeps/loongarch/lp64/multiarch/strcpy-lasx.S b/sysdeps/loongarch/lp64/multiarch/strcpy-lasx.S
new file mode 100644
index 0000000000..c282561253
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strcpy-lasx.S
@@ -0,0 +1,215 @@
+/* Optimized strcpy stpcpy implementation using LoongArch LASX instructions.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc) && !defined __loongarch_soft_float
+
+# ifndef STRCPY
+#  define STRCPY __strcpy_lasx
+# endif
+
+# ifdef USE_AS_STPCPY
+#  define dstend a0
+# else
+#  define dstend a4
+# endif
+
+LEAF(STRCPY, 6)
+    ori             t8, zero, 0xfe0
+    andi            t0, a1, 0xfff
+    li.d            t7, -1
+    move            a2, a0
+
+    bltu            t8, t0, L(page_cross_start)
+L(start_entry):
+    xvld            xr0, a1, 0
+    li.d            t0, 32
+    andi            t1, a2, 0x1f
+
+    xvsetanyeqz.b   fcc0, xr0
+    sub.d           t0, t0, t1
+    bcnez           fcc0, L(end)
+    add.d           a1, a1, t0
+
+    xvst            xr0, a2, 0
+    andi            a3, a1, 0x1f
+    add.d           a2, a2, t0
+    bnez            a3, L(unaligned)
+
+
+    xvld            xr0, a1, 0
+    xvsetanyeqz.b   fcc0, xr0
+    bcnez           fcc0, L(al_end)
+L(al_loop):
+    xvst            xr0, a2, 0
+
+    xvld            xr0, a1, 32
+    addi.d          a2, a2, 32
+    addi.d          a1, a1, 32
+    xvsetanyeqz.b   fcc0, xr0
+
+    bceqz           fcc0, L(al_loop)
+L(al_end):
+    xvmsknz.b       xr0, xr0
+    xvpickve.w      xr1, xr0, 4
+    vilvl.h         vr0, vr1, vr0
+
+    movfr2gr.s      t0, fa0
+    cto.w           t0, t0
+    add.d           a1, a1, t0
+    xvld            xr0, a1, -31
+
+
+    add.d           dstend, a2, t0
+    xvst            xr0, dstend, -31
+    jr              ra
+    nop
+
+L(page_cross_start):
+    move            a4, a1
+    bstrins.d       a4, zero, 4, 0
+    xvld            xr0, a4, 0
+    xvmsknz.b       xr0, xr0
+
+    xvpickve.w      xr1, xr0, 4
+    vilvl.h         vr0, vr1, vr0
+    movfr2gr.s      t0, fa0
+    sra.w           t0, t0, a1
+
+    beq             t0, t7, L(start_entry)
+    b               L(tail)
+L(unaligned):
+    andi            t0, a1, 0xfff
+    bltu            t8, t0, L(un_page_cross)
+
+
+L(un_start_entry):
+    xvld            xr0, a1, 0
+    xvsetanyeqz.b   fcc0, xr0
+    bcnez           fcc0, L(un_end)
+    addi.d          a1, a1, 32
+
+L(un_loop):
+    xvst            xr0, a2, 0
+    andi            t0, a1, 0xfff
+    addi.d          a2, a2, 32
+    bltu            t8, t0, L(page_cross_loop)
+
+L(un_loop_entry):
+    xvld            xr0, a1, 0
+    addi.d          a1, a1, 32
+    xvsetanyeqz.b   fcc0, xr0
+    bceqz           fcc0, L(un_loop)
+
+    addi.d          a1, a1, -32
+L(un_end):
+    xvmsknz.b       xr0, xr0
+    xvpickve.w      xr1, xr0, 4
+    vilvl.h         vr0, vr1, vr0
+
+
+    movfr2gr.s      t0, fa0
+L(un_tail):
+    cto.w           t0, t0
+    add.d           a1, a1, t0
+    xvld            xr0, a1, -31
+
+    add.d           dstend, a2, t0
+    xvst            xr0, dstend, -31
+    jr              ra
+L(un_page_cross):
+    sub.d           a4, a1, a3
+
+    xvld            xr0, a4, 0
+    xvmsknz.b       xr0, xr0
+    xvpickve.w      xr1, xr0, 4
+    vilvl.h         vr0, vr1, vr0
+
+    movfr2gr.s      t0, fa0
+    sra.w           t0, t0, a1
+    beq             t0, t7, L(un_start_entry)
+    b               L(un_tail)
+
+
+L(page_cross_loop):
+    sub.d           a4, a1, a3
+    xvld            xr0, a4, 0
+    xvmsknz.b       xr0, xr0
+    xvpickve.w      xr1, xr0, 4
+
+    vilvl.h         vr0, vr1, vr0
+    movfr2gr.s      t0, fa0
+    sra.w           t0, t0, a1
+    beq             t0, t7, L(un_loop_entry)
+
+    b               L(un_tail)
+L(end):
+    xvmsknz.b       xr0, xr0
+    xvpickve.w      xr1, xr0, 4
+    vilvl.h         vr0, vr1, vr0
+
+    movfr2gr.s      t0, fa0
+L(tail):
+    cto.w           t0, t0
+    add.d           dstend, a2, t0
+    add.d           a5, a1, t0
+
+L(less_32):
+    srli.d          t1, t0, 4
+    beqz            t1, L(less_16)
+    vld             vr0, a1, 0
+    vld             vr1, a5, -15
+
+    vst             vr0, a2, 0
+    vst             vr1, dstend, -15
+    jr              ra
+L(less_16):
+    srli.d          t1, t0, 3
+
+    beqz            t1, L(less_8)
+    ld.d            t2, a1, 0
+    ld.d            t3, a5, -7
+    st.d            t2, a2, 0
+
+    st.d            t3, dstend, -7
+    jr              ra
+L(less_8):
+    li.d            t1, 3
+    bltu            t0, t1, L(less_3)
+
+    ld.w            t2, a1, 0
+    ld.w            t3, a5, -3
+    st.w            t2, a2, 0
+    st.w            t3, dstend, -3
+
+    jr              ra
+L(less_3):
+    beqz            t0, L(zero_byte)
+    ld.h            t2, a1, 0
+
+    st.h            t2, a2, 0
+L(zero_byte):
+    st.b            zero, dstend, 0
+    jr              ra
+END(STRCPY)
+
+libc_hidden_builtin_def (STRCPY)
+#endif
diff --git a/sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S b/sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S
new file mode 100644
index 0000000000..fc2498f7db
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S
@@ -0,0 +1,212 @@
+/* Optimized strcpy stpcpy implementation using LoongArch LSX instructions.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc) && !defined __loongarch_soft_float
+
+# ifndef STRCPY
+#  define STRCPY __strcpy_lsx
+# endif
+
+LEAF(STRCPY, 6)
+    pcalau12i       t0, %pc_hi20(L(INDEX))
+    andi            a4, a1, 0xf
+    vld             vr1, t0, %pc_lo12(L(INDEX))
+    move            a2, a0
+
+    beqz            a4, L(load_start)
+    xor             t0, a1, a4
+    vld             vr0, t0, 0
+    vreplgr2vr.b    vr2, a4
+
+    vadd.b          vr2, vr2, vr1
+    vshuf.b         vr0, vr2, vr0, vr2
+    vsetanyeqz.b    fcc0, vr0
+    bcnez           fcc0, L(end)
+
+L(load_start):
+    vld             vr0, a1, 0
+    li.d            t1, 16
+    andi            a3, a2, 0xf
+    vsetanyeqz.b    fcc0, vr0
+
+
+    sub.d           t0, t1, a3
+    bcnez           fcc0, L(end)
+    add.d           a1, a1, t0
+    vst             vr0, a2, 0
+
+    andi            a3, a1, 0xf
+    add.d           a2, a2, t0
+    bnez            a3, L(unaligned)
+    vld             vr0, a1, 0
+
+    vsetanyeqz.b    fcc0, vr0
+    bcnez           fcc0, L(al_end)
+L(al_loop):
+    vst             vr0, a2, 0
+    vld             vr0, a1, 16
+
+    addi.d          a2, a2, 16
+    addi.d          a1, a1, 16
+    vsetanyeqz.b    fcc0, vr0
+    bceqz           fcc0, L(al_loop)
+
+
+L(al_end):
+    vmsknz.b        vr1, vr0
+    movfr2gr.s      t0, fa1
+    cto.w           t0, t0
+    add.d           a1, a1, t0
+
+    vld             vr0, a1, -15
+# ifdef USE_AS_STPCPY
+    add.d           a0, a2, t0
+    vst             vr0, a0, -15
+# else
+    add.d           a2, a2, t0
+    vst             vr0, a2, -15
+# endif
+    jr              ra
+
+L(end):
+    vmsknz.b        vr1, vr0
+    movfr2gr.s      t0, fa1
+    cto.w           t0, t0
+    addi.d          t0, t0, 1
+
+L(end_16):
+    andi            t1, t0, 16
+    beqz            t1, L(end_8)
+    vst             vr0, a2, 0
+# ifdef USE_AS_STPCPY
+    addi.d          a0, a2, 15
+# endif
+    jr              ra
+
+L(end_8):
+    andi            t2, t0, 8
+    andi            t3, t0, 4
+    andi            t4, t0, 2
+    andi            t5, t0, 1
+
+    beqz            t2, L(end_4)
+    vstelm.d        vr0, a2, 0, 0
+    addi.d          a2, a2, 8
+    vbsrl.v         vr0, vr0, 8
+
+L(end_4):
+    beqz            t3, L(end_2)
+    vstelm.w        vr0, a2, 0, 0
+    addi.d          a2, a2, 4
+    vbsrl.v         vr0, vr0, 4
+
+L(end_2):
+    beqz            t4, L(end_1)
+    vstelm.h        vr0, a2, 0, 0
+    addi.d          a2, a2, 2
+    vbsrl.v         vr0, vr0, 2
+
+
+L(end_1):
+    beqz            t5, L(out)
+    vstelm.b        vr0, a2, 0, 0
+    addi.d          a2, a2, 1
+L(out):
+# ifdef USE_AS_STPCPY
+    addi.d          a0, a2, -1
+# endif
+    jr              ra
+
+    .align          4
+L(unaligned):
+    bstrins.d       a1, zero, 3, 0
+    vld             vr2, a1, 0
+    vreplgr2vr.b    vr3, a3
+    vslt.b          vr4, vr1, vr3
+
+    vor.v           vr0, vr2, vr4
+    vsetanyeqz.b    fcc0, vr0
+    bcnez           fcc0, L(un_first_end)
+    vld             vr0, a1, 16
+
+    vadd.b          vr3, vr3, vr1
+    vshuf.b         vr4, vr0, vr2, vr3
+    vsetanyeqz.b    fcc0, vr0
+    bcnez           fcc0, L(un_end)
+
+
+    vor.v           vr2, vr0, vr0
+    addi.d          a1, a1, 16
+L(un_loop):
+    vld             vr0, a1, 16
+    vst             vr4, a2, 0
+
+    addi.d          a2, a2, 16
+    vshuf.b         vr4, vr0, vr2, vr3
+    vsetanyeqz.b    fcc0, vr0
+    bcnez           fcc0, L(un_end)
+
+    vld             vr2, a1, 32
+    vst             vr4, a2, 0
+    addi.d          a1, a1, 32
+    addi.d          a2, a2, 16
+
+    vshuf.b         vr4, vr2, vr0, vr3
+    vsetanyeqz.b    fcc0, vr2
+    bceqz           fcc0, L(un_loop)
+    vor.v           vr0, vr2, vr2
+
+
+    addi.d          a1, a1, -16
+L(un_end):
+    vsetanyeqz.b    fcc0, vr4
+    bcnez           fcc0, 1f
+    vst             vr4, a2, 0
+
+1:
+    vmsknz.b        vr1, vr0
+    movfr2gr.s      t0, fa1
+    cto.w           t0, t0
+    add.d           a1, a1, t0
+
+    vld             vr0, a1, 1
+    add.d           a2, a2, t0
+    sub.d           a2, a2, a3
+    vst             vr0, a2, 1
+# ifdef USE_AS_STPCPY
+    addi.d          a0, a2, 16
+# endif
+    jr              ra
+L(un_first_end):
+    addi.d          a2, a2, -16
+    addi.d          a1, a1, -16
+    b               1b
+END(STRCPY)
+
+    .section        .rodata.cst16,"M",@progbits,16
+    .align          4
+L(INDEX):
+    .dword          0x0706050403020100
+    .dword          0x0f0e0d0c0b0a0908
+
+libc_hidden_builtin_def (STRCPY)
+#endif
diff --git a/sysdeps/loongarch/lp64/multiarch/strcpy-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strcpy-unaligned.S
new file mode 100644
index 0000000000..9e31883bb1
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strcpy-unaligned.S
@@ -0,0 +1,138 @@
+/* Optimized strcpy unaligned implementation using basic LoongArch
+   instructions.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc)
+
+# ifndef STRCPY
+#  define STRCPY __strcpy_unaligned
+# endif
+
+# ifdef USE_AS_STPCPY
+#  define dstend a0
+# else
+#  define dstend a4
+# endif
+
+LEAF(STRCPY, 6)
+    lu12i.w     t5, 0x01010
+    li.w        t0, 0xff8
+    ori         t5, t5, 0x101
+    andi        t1, a1, 0xfff
+
+    bstrins.d   t5, t5, 63, 32
+    move        a2, a0
+    slli.d      t6, t5, 7
+    bltu        t0, t1, L(page_cross)
+
+L(start_entry):
+    ld.d        t0, a1, 0
+    li.d        t3, 8
+    andi        a3, a1, 0x7
+    sub.d       t1, t0, t5
+
+    andn        t2, t6, t0
+    sub.d       t3, t3, a3
+    and         t1, t1, t2
+    bnez        t1, L(end)
+
+
+    add.d       a1, a1, t3
+    st.d        t0, a2, 0
+    add.d       a2, a2, t3
+    ld.d        t0, a1, 0
+
+    sub.d       t1, t0, t5
+    andn        t2, t6, t0
+    and         t1, t1, t2
+    bnez        t1, L(long_end)
+
+L(loop):
+    st.d        t0, a2, 0
+    ld.d        t0, a1, 8
+    addi.d      a2, a2, 8
+    addi.d      a1, a1, 8
+
+    sub.d       t1, t0, t5
+    andn        t2, t6, t0
+    and         t1, t1, t2
+    beqz        t1, L(loop)
+
+
+L(long_end):
+    ctz.d       t1, t1
+    srli.d      t1, t1, 3
+    add.d       a1, a1, t1
+    ld.d        t0, a1, -7
+
+    add.d       dstend, a2, t1
+    st.d        t0, dstend, -7
+    jr          ra
+    nop
+
+L(end):
+    ctz.d       t1, t1
+    srli.d      t1, t1, 3
+    add.d       a3, a1, t1
+    add.d       dstend, a2, t1
+
+L(less_8):
+    li.d        t0, 3
+    bltu        t1, t0, L(less_3)
+    ld.w        t1, a1, 0
+    ld.w        t2, a3, -3
+
+
+    st.w        t1, a2, 0
+    st.w        t2, dstend, -3
+    jr          ra
+L(less_3):
+    beqz        t1, L(zero_bytes)
+
+    ld.h        t1, a1, 0
+    st.h        t1, a2, 0
+L(zero_bytes):
+    st.b        zero, dstend, 0
+    jr          ra
+
+L(page_cross):
+    move        a4, a1
+    bstrins.d   a4, zero, 2, 0
+    ld.d        t0, a4, 0
+    li.d        t3, -1
+
+    slli.d      t4, a1, 3
+    srl.d       t3, t3, t4
+    srl.d       t0, t0, t4
+    orn         t0, t0, t3
+
+
+    sub.d       t1, t0, t5
+    andn        t2, t6, t0
+    and         t1, t1, t2
+    beqz        t1, L(start_entry)
+
+    b           L(end)
+END(STRCPY)
+
+libc_hidden_builtin_def (STRCPY)
+#endif
diff --git a/sysdeps/loongarch/lp64/multiarch/strcpy.c b/sysdeps/loongarch/lp64/multiarch/strcpy.c
new file mode 100644
index 0000000000..46afd068f9
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strcpy.c
@@ -0,0 +1,35 @@
+/* Multiple versions of strcpy.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+# define strcpy __redirect_strcpy
+# include <string.h>
+# undef strcpy
+
+# define SYMBOL_NAME strcpy
+# include "ifunc-lasx.h"
+
+libc_ifunc_redirected (__redirect_strcpy, strcpy, IFUNC_SELECTOR ());
+
+# ifdef SHARED
+__hidden_ver1 (strcpy, __GI_strcpy, __redirect_strcpy)
+  __attribute__ ((visibility ("hidden"))) __attribute_copy__ (strcpy);
+# endif
+#endif
-- 
2.40.0



* [PATCH v2 2/3] LoongArch: Add ifunc support for strrchr{aligned, lsx, lasx}
From: dengjianbo @ 2023-09-13  7:35 UTC
  To: libc-alpha
  Cc: adhemerval.zanella, xry111, caiyinyu, xuchenghua, huangpei, dengjianbo

According to the glibc strrchr microbenchmark results, this
implementation reduces the runtime as follows:

Name                Percent of runtime reduced
strrchr-lasx        10%-50%
strrchr-lsx         0%-50%
strrchr-aligned     5%-50%

Generic strrchr is implemented as strlen + memrchr; the lasx version is
compared with a generic strrchr built from strlen-lasx + memrchr-lasx,
the lsx version with a generic strrchr built from strlen-lsx +
memrchr-lsx, and the aligned version with a generic strrchr built from
strlen-aligned + memrchr-generic.
---
 sysdeps/loongarch/lp64/multiarch/Makefile     |   3 +
 .../lp64/multiarch/ifunc-impl-list.c          |   8 +
 .../loongarch/lp64/multiarch/ifunc-strrchr.h  |  41 ++++
 .../lp64/multiarch/strrchr-aligned.S          | 170 +++++++++++++++++
 .../loongarch/lp64/multiarch/strrchr-lasx.S   | 176 ++++++++++++++++++
 .../loongarch/lp64/multiarch/strrchr-lsx.S    | 144 ++++++++++++++
 sysdeps/loongarch/lp64/multiarch/strrchr.c    |  36 ++++
 7 files changed, 578 insertions(+)
 create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-strrchr.h
 create mode 100644 sysdeps/loongarch/lp64/multiarch/strrchr-aligned.S
 create mode 100644 sysdeps/loongarch/lp64/multiarch/strrchr-lasx.S
 create mode 100644 sysdeps/loongarch/lp64/multiarch/strrchr-lsx.S
 create mode 100644 sysdeps/loongarch/lp64/multiarch/strrchr.c

diff --git a/sysdeps/loongarch/lp64/multiarch/Makefile b/sysdeps/loongarch/lp64/multiarch/Makefile
index 39550bea9b..fe863e1ba4 100644
--- a/sysdeps/loongarch/lp64/multiarch/Makefile
+++ b/sysdeps/loongarch/lp64/multiarch/Makefile
@@ -9,6 +9,9 @@ sysdep_routines += \
   strchr-aligned \
   strchr-lsx \
   strchr-lasx \
+  strrchr-aligned \
+  strrchr-lsx \
+  strrchr-lasx \
   strchrnul-aligned \
   strchrnul-lsx \
   strchrnul-lasx \
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
index 39a14f1d6e..529e236950 100644
--- a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
@@ -94,6 +94,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_aligned)
 	      )
 
+  IFUNC_IMPL (i, name, strrchr,
+#if !defined __loongarch_soft_float
+	      IFUNC_IMPL_ADD (array, i, strrchr, SUPPORT_LASX, __strrchr_lasx)
+	      IFUNC_IMPL_ADD (array, i, strrchr, SUPPORT_LSX, __strrchr_lsx)
+#endif
+	      IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_aligned)
+	      )
+
   IFUNC_IMPL (i, name, memcpy,
 #if !defined __loongarch_soft_float
               IFUNC_IMPL_ADD (array, i, memcpy, SUPPORT_LASX, __memcpy_lasx)
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-strrchr.h b/sysdeps/loongarch/lp64/multiarch/ifunc-strrchr.h
new file mode 100644
index 0000000000..bbb34089ef
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-strrchr.h
@@ -0,0 +1,41 @@
+/* Common definition for strrchr ifunc selections.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <ldsodefs.h>
+#include <ifunc-init.h>
+
+#if !defined __loongarch_soft_float
+extern __typeof (REDIRECT_NAME) OPTIMIZE (lasx) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (lsx) attribute_hidden;
+#endif
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (aligned) attribute_hidden;
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+#if !defined __loongarch_soft_float
+  if (SUPPORT_LASX)
+    return OPTIMIZE (lasx);
+  else if (SUPPORT_LSX)
+    return OPTIMIZE (lsx);
+  else
+#endif
+    return OPTIMIZE (aligned);
+}
diff --git a/sysdeps/loongarch/lp64/multiarch/strrchr-aligned.S b/sysdeps/loongarch/lp64/multiarch/strrchr-aligned.S
new file mode 100644
index 0000000000..a73deb7840
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strrchr-aligned.S
@@ -0,0 +1,170 @@
+/* Optimized strrchr implementation using basic LoongArch instructions.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc)
+# define STRRCHR __strrchr_aligned
+#else
+# define STRRCHR strrchr
+#endif
+
+LEAF(STRRCHR, 6)
+    slli.d      t0, a0, 3
+    bstrins.d   a0, zero, 2, 0
+    lu12i.w     a2, 0x01010
+    ld.d        t2, a0, 0
+
+    andi        a1, a1, 0xff
+    ori         a2, a2, 0x101
+    li.d        t3, -1
+    bstrins.d	a2, a2, 63, 32
+
+    sll.d       t5, t3, t0
+    slli.d      a3, a2, 7
+    orn         t4, t2, t5
+    mul.d       a1, a1, a2
+
+    sub.d       t0, t4, a2
+    andn        t1, a3, t4
+    and         t1, t0, t1
+    beqz        t1, L(find_tail)
+
+
+    ctz.d       t0, t1
+    orn         t0, zero, t0
+    xor         t2, t4, a1
+    srl.d       t0, t3, t0
+
+    orn         t2, t2, t0
+    orn         t2, t2, t5
+    revb.d      t2, t2
+    sub.d       t1, t2, a2
+
+    andn        t0, a3, t2
+    and         t1, t0, t1
+    ctz.d       t0, t1
+    srli.d      t0, t0, 3
+
+    addi.d      a0, a0, 7
+    sub.d       a0, a0, t0
+    maskeqz     a0, a0, t1
+    jr          ra
+
+
+L(find_tail):
+    addi.d      a4, a0, 8
+    addi.d      a0, a0, 8
+L(loop_ascii):
+    ld.d        t2, a0, 0
+    sub.d       t1, t2, a2
+
+    and         t0, t1, a3
+    bnez        t0, L(more_check)
+    ld.d        t2, a0, 8
+    sub.d       t1, t2, a2
+
+    and         t0, t1, a3
+    addi.d      a0, a0, 16
+    beqz        t0, L(loop_ascii)
+    addi.d      a0, a0, -8
+
+L(more_check):
+    andn        t0, a3, t2
+    and         t1, t1, t0
+    bnez        t1, L(tail)
+    addi.d      a0, a0, 8
+
+
+L(loop_nonascii):
+    ld.d        t2, a0, 0
+    sub.d       t1, t2, a2
+    andn        t0, a3, t2
+    and         t1, t0, t1
+
+    bnez        t1, L(tail)
+    ld.d        t2, a0, 8
+    addi.d      a0, a0, 16
+    sub.d       t1, t2, a2
+
+    andn        t0, a3, t2
+    and         t1, t0, t1
+    beqz        t1, L(loop_nonascii)
+    addi.d      a0, a0, -8
+
+L(tail):
+    ctz.d       t0, t1
+    orn         t0, zero, t0
+    xor         t2, t2, a1
+    srl.d       t0, t3, t0
+
+
+    orn         t2, t2, t0
+    revb.d      t2, t2
+    sub.d       t1, t2, a2
+    andn        t0, a3, t2
+
+    and         t1, t0, t1
+    bnez        t1, L(count_pos)
+L(find_loop):
+    beq         a0, a4, L(find_end)
+    ld.d        t2, a0, -8
+
+    addi.d      a0, a0, -8
+    xor         t2, t2, a1
+    sub.d       t1, t2, a2
+    andn        t0, a3, t2
+
+    and         t1, t0, t1
+    beqz        t1, L(find_loop)
+    revb.d      t2, t2
+    sub.d       t1, t2, a2
+
+
+    andn        t0, a3, t2
+    and         t1, t0, t1
+L(count_pos):
+    ctz.d       t0, t1
+    addi.d      a0, a0, 7
+
+    srli.d      t0, t0, 3
+    sub.d       a0, a0, t0
+    jr          ra
+    nop
+
+L(find_end):
+    xor         t2, t4, a1
+    orn         t2, t2, t5
+    revb.d      t2, t2
+    sub.d       t1, t2, a2
+
+
+    andn        t0, a3, t2
+    and         t1, t0, t1
+    ctz.d       t0, t1
+    srli.d      t0, t0, 3
+
+    addi.d      a0, a4, -1
+    sub.d       a0, a0, t0
+    maskeqz     a0, a0, t1
+    jr          ra
+END(STRRCHR)
+
+libc_hidden_builtin_def(STRRCHR)
diff --git a/sysdeps/loongarch/lp64/multiarch/strrchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/strrchr-lasx.S
new file mode 100644
index 0000000000..5a6e22979a
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strrchr-lasx.S
@@ -0,0 +1,176 @@
+/* Optimized strrchr implementation using LoongArch LASX instructions.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc) && !defined __loongarch_soft_float
+
+#define STRRCHR __strrchr_lasx
+
+LEAF(STRRCHR, 6)
+    move            a2, a0
+    bstrins.d       a0, zero, 5, 0
+    xvld            xr0, a0, 0
+    xvld            xr1, a0, 32
+
+    li.d            t2, -1
+    xvreplgr2vr.b   xr4, a1
+    xvmsknz.b       xr2, xr0
+    xvmsknz.b       xr3, xr1
+
+    xvpickve.w      xr5, xr2, 4
+    xvpickve.w      xr6, xr3, 4
+    vilvl.h         vr2, vr5, vr2
+    vilvl.h         vr3, vr6, vr3
+
+    vilvl.w         vr2, vr3, vr2
+    movfr2gr.d      t0, fa2
+    sra.d           t0, t0, a2
+    beq             t0, t2, L(find_tail)
+
+
+    xvseq.b         xr2, xr0, xr4
+    xvseq.b         xr3, xr1, xr4
+    xvmsknz.b       xr2, xr2
+    xvmsknz.b       xr3, xr3
+
+    xvpickve.w      xr4, xr2, 4
+    xvpickve.w      xr5, xr3, 4
+    vilvl.h         vr2, vr4, vr2
+    vilvl.h         vr3, vr5, vr3
+
+    vilvl.w         vr1, vr3, vr2
+    slli.d          t3, t2, 1
+    movfr2gr.d      t1, fa1
+    cto.d           t0, t0
+
+    srl.d           t1, t1, a2
+    sll.d           t3, t3, t0
+    addi.d          a0, a2, 63
+    andn            t1, t1, t3
+
+
+    clz.d           t0, t1
+    sub.d           a0, a0, t0
+    maskeqz         a0, a0, t1
+    jr              ra
+
+    .align          5
+L(find_tail):
+    addi.d          a3, a0, 64
+L(loop):
+    xvld            xr2, a0, 64
+    xvld            xr3, a0, 96
+    addi.d          a0, a0, 64
+
+    xvmin.bu        xr5, xr2, xr3
+    xvsetanyeqz.b   fcc0, xr5
+    bceqz           fcc0, L(loop)
+    xvmsknz.b       xr5, xr2
+
+
+    xvmsknz.b       xr6, xr3
+    xvpickve.w      xr7, xr5, 4
+    xvpickve.w      xr8, xr6, 4
+    vilvl.h         vr5, vr7, vr5
+
+    vilvl.h         vr6, vr8, vr6
+    xvseq.b         xr2, xr2, xr4
+    xvseq.b         xr3, xr3, xr4
+    xvmsknz.b       xr2, xr2
+
+    xvmsknz.b       xr3, xr3
+    xvpickve.w      xr7, xr2, 4
+    xvpickve.w      xr8, xr3, 4
+    vilvl.h         vr2, vr7, vr2
+
+    vilvl.h         vr3, vr8, vr3
+    vilvl.w         vr5, vr6, vr5
+    vilvl.w         vr2, vr3, vr2
+    movfr2gr.d      t0, fa5
+
+
+    movfr2gr.d      t1, fa2
+    slli.d          t3, t2, 1
+    cto.d           t0, t0
+    sll.d           t3, t3, t0
+
+    andn            t1, t1, t3
+    beqz            t1, L(find_loop)
+    clz.d           t0, t1
+    addi.d          a0, a0, 63
+
+    sub.d           a0, a0, t0
+    jr              ra
+L(find_loop):
+    beq             a0, a3, L(find_end)
+    xvld            xr2, a0, -64
+
+    xvld            xr3, a0, -32
+    addi.d          a0, a0, -64
+    xvseq.b         xr2, xr2, xr4
+    xvseq.b         xr3, xr3, xr4
+
+
+    xvmax.bu        xr5, xr2, xr3
+    xvseteqz.v      fcc0, xr5
+    bcnez           fcc0, L(find_loop)
+    xvmsknz.b       xr0, xr2
+
+    xvmsknz.b       xr1, xr3
+    xvpickve.w      xr2, xr0, 4
+    xvpickve.w      xr3, xr1, 4
+    vilvl.h         vr0, vr2, vr0
+
+    vilvl.h         vr1, vr3, vr1
+    vilvl.w         vr0, vr1, vr0
+    movfr2gr.d      t0, fa0
+    addi.d          a0, a0, 63
+
+    clz.d           t0, t0
+    sub.d           a0, a0, t0
+    jr              ra
+    nop
+
+
+L(find_end):
+    xvseq.b         xr2, xr0, xr4
+    xvseq.b         xr3, xr1, xr4
+    xvmsknz.b       xr2, xr2
+    xvmsknz.b       xr3, xr3
+
+    xvpickve.w      xr4, xr2, 4
+    xvpickve.w      xr5, xr3, 4
+    vilvl.h         vr2, vr4, vr2
+    vilvl.h         vr3, vr5, vr3
+
+    vilvl.w         vr1, vr3, vr2
+    movfr2gr.d      t1, fa1
+    addi.d          a0, a2, 63
+    srl.d           t1, t1, a2
+
+    clz.d           t0, t1
+    sub.d           a0, a0, t0
+    maskeqz         a0, a0, t1
+    jr              ra
+END(STRRCHR)
+
+libc_hidden_builtin_def(STRRCHR)
+#endif
diff --git a/sysdeps/loongarch/lp64/multiarch/strrchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/strrchr-lsx.S
new file mode 100644
index 0000000000..8f2fd22e50
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strrchr-lsx.S
@@ -0,0 +1,144 @@
+/* Optimized strrchr implementation using LoongArch LSX instructions.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc) && !defined __loongarch_soft_float
+
+#define STRRCHR __strrchr_lsx
+
+LEAF(STRRCHR, 6)
+    move            a2, a0
+    bstrins.d       a0, zero, 4, 0
+    vld             vr0, a0, 0
+    vld             vr1, a0, 16
+
+    li.d            t2, -1
+    vreplgr2vr.b    vr4, a1
+    vmsknz.b        vr2, vr0
+    vmsknz.b        vr3, vr1
+
+    vilvl.h         vr2, vr3, vr2
+    movfr2gr.s      t0, fa2
+    sra.w           t0, t0, a2
+    beq             t0, t2, L(find_tail)
+
+    vseq.b          vr2, vr0, vr4
+    vseq.b          vr3, vr1, vr4
+    vmsknz.b        vr2, vr2
+    vmsknz.b        vr3, vr3
+
+
+    vilvl.h         vr1, vr3, vr2
+    slli.d          t3, t2, 1
+    movfr2gr.s      t1, fa1
+    cto.w           t0, t0
+
+    srl.w           t1, t1, a2
+    sll.d           t3, t3, t0
+    addi.d          a0, a2, 31
+    andn            t1, t1, t3
+
+    clz.w           t0, t1
+    sub.d           a0, a0, t0
+    maskeqz         a0, a0, t1
+    jr              ra
+
+    .align          5
+L(find_tail):
+    addi.d          a3, a0, 32
+L(loop):
+    vld             vr2, a0, 32
+    vld             vr3, a0, 48
+    addi.d          a0, a0, 32
+
+    vmin.bu         vr5, vr2, vr3
+    vsetanyeqz.b    fcc0, vr5
+    bceqz           fcc0, L(loop)
+    vmsknz.b        vr5, vr2
+
+    vmsknz.b        vr6, vr3
+    vilvl.h         vr5, vr6, vr5
+    vseq.b          vr2, vr2, vr4
+    vseq.b          vr3, vr3, vr4
+
+    vmsknz.b        vr2, vr2
+    vmsknz.b        vr3, vr3
+    vilvl.h         vr2, vr3, vr2
+    movfr2gr.s      t0, fa5
+
+
+    movfr2gr.s      t1, fa2
+    slli.d          t3, t2, 1
+    cto.w           t0, t0
+    sll.d           t3, t3, t0
+
+    andn            t1, t1, t3
+    beqz            t1, L(find_loop)
+    clz.w           t0, t1
+    addi.d          a0, a0, 31
+
+    sub.d           a0, a0, t0
+    jr              ra
+L(find_loop):
+    beq             a0, a3, L(find_end)
+    vld             vr2, a0, -32
+
+    vld             vr3, a0, -16
+    addi.d          a0, a0, -32
+    vseq.b          vr2, vr2, vr4
+    vseq.b          vr3, vr3, vr4
+
+
+    vmax.bu         vr5, vr2, vr3
+    vseteqz.v       fcc0, vr5
+    bcnez           fcc0, L(find_loop)
+    vmsknz.b        vr0, vr2
+
+    vmsknz.b        vr1, vr3
+    vilvl.h         vr0, vr1, vr0
+    movfr2gr.s      t0, fa0
+    addi.d          a0, a0, 31
+
+    clz.w           t0, t0
+    sub.d           a0, a0, t0
+    jr              ra
+    nop
+
+L(find_end):
+    vseq.b          vr2, vr0, vr4
+    vseq.b          vr3, vr1, vr4
+    vmsknz.b        vr2, vr2
+    vmsknz.b        vr3, vr3
+
+
+    vilvl.h         vr1, vr3, vr2
+    movfr2gr.s      t1, fa1
+    addi.d          a0, a2, 31
+    srl.w           t1, t1, a2
+
+    clz.w           t0, t1
+    sub.d           a0, a0, t0
+    maskeqz         a0, a0, t1
+    jr              ra
+END(STRRCHR)
+
+libc_hidden_builtin_def(STRRCHR)
+#endif
diff --git a/sysdeps/loongarch/lp64/multiarch/strrchr.c b/sysdeps/loongarch/lp64/multiarch/strrchr.c
new file mode 100644
index 0000000000..d9c9f660a0
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strrchr.c
@@ -0,0 +1,36 @@
+/* Multiple versions of strrchr.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+# define strrchr __redirect_strrchr
+# include <string.h>
+# undef strrchr
+
+# define SYMBOL_NAME strrchr
+# include "ifunc-strrchr.h"
+
+libc_ifunc_redirected (__redirect_strrchr, strrchr, IFUNC_SELECTOR ());
+weak_alias (strrchr, rindex)
+# ifdef SHARED
+__hidden_ver1 (strrchr, __GI_strrchr, __redirect_strrchr)
+  __attribute__ ((visibility ("hidden"))) __attribute_copy__ (strrchr);
+# endif
+
+#endif
-- 
2.40.0



* [PATCH v2 3/3] LoongArch: Change to put magic number to .rodata section
From: dengjianbo @ 2023-09-13  7:35 UTC
  To: libc-alpha
  Cc: adhemerval.zanella, xry111, caiyinyu, xuchenghua, huangpei, dengjianbo

Move the magic numbers used by memmove-lsx into the .rodata section,
and use pcalau12i and %pc_lo12 with vld to load them, instead of
keeping the data inline in .text and addressing it with pcaddi.
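
In C terms this is the layout a compiler would produce for a file-scope
constant: the table lives in .rodata and its address is formed
PC-relatively (pcalau12i + %pc_lo12) rather than the data sitting
between instructions in .text.  A sketch; shuffle_index mirrors the two
dwords at L(INDEX):

    #include <stdint.h>

    /* Bytes 0x00..0x0f, i.e. the dwords 0x0706050403020100 and
       0x0f0e0d0c0b0a0908 in little-endian byte order.  A compiler
       places this in .rodata and materialises its address with a
       pcalau12i/%pc_lo12 pair, as the patch now does by hand.  */
    static const uint8_t shuffle_index[16] = {
      0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
      0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
    };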
---
 .../loongarch/lp64/multiarch/memmove-lsx.S    | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S b/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S
index 8a9367708d..5eb819ef74 100644
--- a/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S
+++ b/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S
@@ -209,13 +209,10 @@ L(al_less_16):
     nop
 
 
-L(magic_num):
-    .dword          0x0706050403020100
-    .dword          0x0f0e0d0c0b0a0908
 L(unaligned):
-    pcaddi          t2, -4
+    pcalau12i       t2, %pc_hi20(L(INDEX))
     bstrins.d       a1, zero, 3, 0
-    vld             vr8, t2, 0
+    vld             vr8, t2, %pc_lo12(L(INDEX))
     vld             vr0, a1, 0
 
     vld             vr1, a1, 16
@@ -413,13 +410,10 @@ L(back_al_less_16):
     vst             vr1, a0, 0
     jr              ra
 
-L(magic_num_2):
-    .dword          0x0706050403020100
-    .dword          0x0f0e0d0c0b0a0908
 L(back_unaligned):
-    pcaddi          t2, -4
+    pcalau12i       t2, %pc_hi20(L(INDEX))
     bstrins.d       a4, zero, 3, 0
-    vld             vr8, t2, 0
+    vld             vr8, t2, %pc_lo12(L(INDEX))
     vld             vr0, a4, 0
 
     vld             vr1, a4, -16
@@ -529,6 +523,12 @@ L(back_un_less_16):
     jr              ra
 END(MEMMOVE_NAME)
 
+    .section        .rodata.cst16,"M",@progbits,16
+    .align          4
+L(INDEX):
+    .dword          0x0706050403020100
+    .dword          0x0f0e0d0c0b0a0908
+
 libc_hidden_builtin_def (MEMCPY_NAME)
 libc_hidden_builtin_def (MEMMOVE_NAME)
 #endif
-- 
2.40.0


