* [glibc] LoongArch: Add ifunc support for memset{aligned, unaligned, lsx, lasx}
@ 2023-08-29  4:33 Yinyu Cai
From: Yinyu Cai @ 2023-08-29  4:33 UTC
  To: glibc-cvs

https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=1b1e9b7c10f27947a7cddddf95701ec8030167a6

commit 1b1e9b7c10f27947a7cddddf95701ec8030167a6
Author: dengjianbo <dengjianbo@loongson.cn>
Date:   Mon Aug 28 10:08:38 2023 +0800

    LoongArch: Add ifunc support for memset{aligned, unaligned, lsx, lasx}
    
    According to glibc memset microbenchmark results, a few cases with
    length less than 8 show performance degradation for the LSX and
    LASX versions.  Overall, the LASX version reduces the runtime by
    about 15%-75%, and the LSX version by about 15%-50%.
    
    The unaligned version uses unaligned memory accesses to set data
    whose length is less than 64 and to make the address 8-byte
    aligned; for this part its performance is better than the aligned
    version's.  Compared with the generic version, the performance is
    close when the length is larger than 128.  When the length is
    8-128, the unaligned version reduces the runtime by about 30%-70%,
    and the aligned version by about 20%-50%.

Diff:
---
 sysdeps/loongarch/lp64/multiarch/Makefile          |   4 +
 .../lp64/multiarch/dl-symbol-redir-ifunc.h         |  24 +++
 sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c |  10 ++
 sysdeps/loongarch/lp64/multiarch/memset-aligned.S  | 174 +++++++++++++++++++++
 sysdeps/loongarch/lp64/multiarch/memset-lasx.S     | 142 +++++++++++++++++
 sysdeps/loongarch/lp64/multiarch/memset-lsx.S      | 135 ++++++++++++++++
 .../loongarch/lp64/multiarch/memset-unaligned.S    | 162 +++++++++++++++++++
 sysdeps/loongarch/lp64/multiarch/memset.c          |  37 +++++
 8 files changed, 688 insertions(+)

diff --git a/sysdeps/loongarch/lp64/multiarch/Makefile b/sysdeps/loongarch/lp64/multiarch/Makefile
index 7b87bc9055..216886c551 100644
--- a/sysdeps/loongarch/lp64/multiarch/Makefile
+++ b/sysdeps/loongarch/lp64/multiarch/Makefile
@@ -30,5 +30,9 @@ sysdep_routines += \
   memrchr-generic \
   memrchr-lsx \
   memrchr-lasx \
+  memset-aligned \
+  memset-unaligned \
+  memset-lsx \
+  memset-lasx \
 # sysdep_routines
 endif
diff --git a/sysdeps/loongarch/lp64/multiarch/dl-symbol-redir-ifunc.h b/sysdeps/loongarch/lp64/multiarch/dl-symbol-redir-ifunc.h
new file mode 100644
index 0000000000..e2723873bc
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/dl-symbol-redir-ifunc.h
@@ -0,0 +1,24 @@
+/* Symbol redirection for loader/static initialization code.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _DL_IFUNC_GENERIC_H
+#define _DL_IFUNC_GENERIC_H
+
+asm ("memset = __memset_aligned");
+
+#endif
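
The asm alias above makes every reference to memset in the loader and
static-initialization code bind to __memset_aligned at assembly time,
so no IFUNC resolver has to run before relocations are complete.  A
standalone sketch of the same technique compiled with GCC, using the
hypothetical names foo/__foo_impl in place of memset/__memset_aligned:

    #include <stdio.h>

    void
    __foo_impl (const char *s)
    {
      printf ("impl: %s\n", s);
    }

    /* Assembler-level alias: every reference to foo emitted in this
       translation unit resolves directly to __foo_impl, with no PLT
       or IFUNC machinery involved.  */
    asm ("foo = __foo_impl");

    void foo (const char *s);

    int
    main (void)
    {
      foo ("hello");   /* Prints "impl: hello".  */
      return 0;
    }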
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
index 8bd5489ee2..37f60dde91 100644
--- a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
@@ -117,5 +117,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 #endif
 	      IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_generic)
 	      )
+
+  IFUNC_IMPL (i, name, memset,
+#if !defined __loongarch_soft_float
+	      IFUNC_IMPL_ADD (array, i, memset, SUPPORT_LASX, __memset_lasx)
+	      IFUNC_IMPL_ADD (array, i, memset, SUPPORT_LSX, __memset_lsx)
+#endif
+	      IFUNC_IMPL_ADD (array, i, memset, SUPPORT_UAL, __memset_unaligned)
+	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_aligned)
+	      )
+
   return i;
 }
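
Each IFUNC_IMPL_ADD line registers one candidate together with a
usability predicate, so test programs can enumerate and exercise every
implementation the running CPU supports.  Conceptually, each entry
fills a record along these lines (a sketch; see glibc's
ifunc-impl-list.h for the real declarations):

    struct libc_ifunc_impl
    {
      const char *name;         /* E.g. "__memset_lasx".  */
      _Bool usable;             /* E.g. SUPPORT_LASX on this CPU.  */
      void (*function) (void);  /* The candidate entry point.  */
    };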
diff --git a/sysdeps/loongarch/lp64/multiarch/memset-aligned.S b/sysdeps/loongarch/lp64/multiarch/memset-aligned.S
new file mode 100644
index 0000000000..1fce95b714
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/memset-aligned.S
@@ -0,0 +1,174 @@
+/* Optimized memset aligned implementation using basic LoongArch instructions.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc)
+# define MEMSET_NAME __memset_aligned
+#else
+# define MEMSET_NAME memset
+#endif
+
+LEAF(MEMSET_NAME, 6)
+    move        t0, a0
+    andi        a3, a0, 0x7
+    li.w        t6, 16
+    beqz        a3, L(align)
+    bltu        a2, t6, L(short_data)
+
+L(make_align):
+    li.w        t8, 8
+    sub.d       t2, t8, a3
+    pcaddi      t1, 11
+    slli.d      t3, t2, 2
+    sub.d       t1, t1, t3
+    jr          t1
+
+L(al7):
+    st.b        a1, t0, 6
+L(al6):
+    st.b        a1, t0, 5
+L(al5):
+    st.b        a1, t0, 4
+L(al4):
+    st.b        a1, t0, 3
+L(al3):
+    st.b        a1, t0, 2
+L(al2):
+    st.b        a1, t0, 1
+L(al1):
+    st.b        a1, t0, 0
+L(al0):
+    add.d       t0, t0, t2
+    sub.d       a2, a2, t2
+
+L(align):
+    bstrins.d   a1, a1, 15, 8
+    bstrins.d   a1, a1, 31, 16
+    bstrins.d   a1, a1, 63, 32
+    bltu        a2, t6, L(less_16bytes)
+
+    andi        a4, a2, 0x3f
+    beq         a4, a2, L(less_64bytes)
+
+    sub.d       t1, a2, a4
+    move        a2, a4
+    add.d       a5, t0, t1
+
+L(loop_64bytes):
+    addi.d      t0, t0, 64
+    st.d        a1, t0, -64
+    st.d        a1, t0, -56
+    st.d        a1, t0, -48
+    st.d        a1, t0, -40
+
+    st.d        a1, t0, -32
+    st.d        a1, t0, -24
+    st.d        a1, t0, -16
+    st.d        a1, t0, -8
+    bne         t0, a5, L(loop_64bytes)
+
+L(less_64bytes):
+    srai.d      a4, a2, 5
+    beqz        a4, L(less_32bytes)
+    addi.d      a2, a2, -32
+    st.d        a1, t0, 0
+
+    st.d        a1, t0, 8
+    st.d        a1, t0, 16
+    st.d        a1, t0, 24
+    addi.d      t0, t0, 32
+
+L(less_32bytes):
+    bltu        a2, t6, L(less_16bytes)
+    addi.d      a2, a2, -16
+    st.d        a1, t0, 0
+    st.d        a1, t0, 8
+    addi.d      t0, t0, 16
+
+L(less_16bytes):
+    srai.d      a4, a2, 3
+    beqz        a4, L(less_8bytes)
+    addi.d      a2, a2, -8
+    st.d        a1, t0, 0
+    addi.d      t0, t0, 8
+
+L(less_8bytes):
+    beqz        a2, L(less_1byte)
+    srai.d      a4, a2, 2
+    beqz        a4, L(less_4bytes)
+    addi.d      a2, a2, -4
+    st.w        a1, t0, 0
+    addi.d      t0, t0, 4
+
+L(less_4bytes):
+    srai.d      a3, a2, 1
+    beqz        a3, L(less_2bytes)
+    addi.d      a2, a2, -2
+    st.h        a1, t0, 0
+    addi.d      t0, t0, 2
+
+L(less_2bytes):
+    beqz        a2, L(less_1byte)
+    st.b        a1, t0, 0
+L(less_1byte):
+    jr          ra
+
+L(short_data):
+    pcaddi      t1, 19
+    slli.d      t3, a2, 2
+    sub.d       t1, t1, t3
+    jr          t1
+L(short_15):
+    st.b        a1, a0, 14
+L(short_14):
+    st.b        a1, a0, 13
+L(short_13):
+    st.b        a1, a0, 12
+L(short_12):
+    st.b        a1, a0, 11
+L(short_11):
+    st.b        a1, a0, 10
+L(short_10):
+    st.b        a1, a0, 9
+L(short_9):
+    st.b        a1, a0, 8
+L(short_8):
+    st.b        a1, a0, 7
+L(short_7):
+    st.b        a1, a0, 6
+L(short_6):
+    st.b        a1, a0, 5
+L(short_5):
+    st.b        a1, a0, 4
+L(short_4):
+    st.b        a1, a0, 3
+L(short_3):
+    st.b        a1, a0, 2
+L(short_2):
+    st.b        a1, a0, 1
+L(short_1):
+    st.b        a1, a0, 0
+L(short_0):
+    jr          ra
+END(MEMSET_NAME)
+
+libc_hidden_builtin_def (MEMSET_NAME)
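
The pcaddi/jr pairs above are computed jumps into the st.b chains, an
assembly form of Duff's device: the jump lands n instructions before
the end of the chain, so exactly n byte stores execute.  A hedged C
analogue of the L(make_align) entry (the helper name is hypothetical;
the asm derives the jump target from the instruction size rather than
a switch):

    /* Store the 0-7 bytes needed to reach 8-byte alignment, falling
       through the cases exactly as the L(al7)..L(al1) chain does.  */
    static void
    set_head_bytes (unsigned char *p, unsigned char c, unsigned int n)
    {
      switch (n)
        {
        case 7: p[6] = c;  /* Fall through.  */
        case 6: p[5] = c;  /* Fall through.  */
        case 5: p[4] = c;  /* Fall through.  */
        case 4: p[3] = c;  /* Fall through.  */
        case 3: p[2] = c;  /* Fall through.  */
        case 2: p[1] = c;  /* Fall through.  */
        case 1: p[0] = c;  /* Fall through.  */
        case 0: break;
        }
    }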
diff --git a/sysdeps/loongarch/lp64/multiarch/memset-lasx.S b/sysdeps/loongarch/lp64/multiarch/memset-lasx.S
new file mode 100644
index 0000000000..041abbac87
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/memset-lasx.S
@@ -0,0 +1,142 @@
+/* Optimized memset implementation using LoongArch LASX instructions.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc) && !defined __loongarch_soft_float
+
+# define MEMSET __memset_lasx
+
+LEAF(MEMSET, 6)
+    li.d            t1, 32
+    move            a3, a0
+    xvreplgr2vr.b   xr0, a1
+    add.d           a4, a0, a2
+
+    bgeu            t1, a2, L(less_32bytes)
+    li.d            t3, 128
+    li.d            t2, 64
+    blt             t3, a2, L(long_bytes)
+
+L(less_128bytes):
+    bgeu            t2, a2, L(less_64bytes)
+    xvst            xr0, a3, 0
+    xvst            xr0, a3, 32
+    xvst            xr0, a4, -32
+
+    xvst            xr0, a4, -64
+    jr              ra
+L(less_64bytes):
+    xvst            xr0, a3, 0
+    xvst            xr0, a4, -32
+
+
+    jr              ra
+L(less_32bytes):
+    srli.d          t0, a2, 4
+    beqz            t0, L(less_16bytes)
+    vst             vr0, a3, 0
+
+    vst             vr0, a4, -16
+    jr              ra
+L(less_16bytes):
+    srli.d          t0, a2, 3
+    beqz            t0, L(less_8bytes)
+
+    vstelm.d        vr0, a3, 0, 0
+    vstelm.d        vr0, a4, -8, 0
+    jr              ra
+L(less_8bytes):
+    srli.d          t0, a2, 2
+
+    beqz            t0, L(less_4bytes)
+    vstelm.w        vr0, a3, 0, 0
+    vstelm.w        vr0, a4, -4, 0
+    jr              ra
+
+
+L(less_4bytes):
+    srli.d          t0, a2, 1
+    beqz            t0, L(less_2bytes)
+    vstelm.h        vr0, a3, 0, 0
+    vstelm.h        vr0, a4, -2, 0
+
+    jr              ra
+L(less_2bytes):
+    beqz            a2, L(less_1bytes)
+    st.b            a1, a3, 0
+L(less_1bytes):
+    jr              ra
+
+L(long_bytes):
+    xvst            xr0, a3, 0
+    bstrins.d       a3, zero, 4, 0
+    addi.d          a3, a3, 32
+    sub.d           a2, a4, a3
+
+    andi            t0, a2, 0xff
+    beq             t0, a2, L(long_end)
+    move            a2, t0
+    sub.d           t0, a4, t0
+
+
+L(loop_256):
+    xvst            xr0, a3, 0
+    xvst            xr0, a3, 32
+    xvst            xr0, a3, 64
+    xvst            xr0, a3, 96
+
+    xvst            xr0, a3, 128
+    xvst            xr0, a3, 160
+    xvst            xr0, a3, 192
+    xvst            xr0, a3, 224
+
+    addi.d          a3, a3, 256
+    bne             a3, t0, L(loop_256)
+L(long_end):
+    bltu            a2, t3, L(end_less_128)
+    addi.d          a2, a2, -128
+
+    xvst            xr0, a3, 0
+    xvst            xr0, a3, 32
+    xvst            xr0, a3, 64
+    xvst            xr0, a3, 96
+
+
+    addi.d          a3, a3, 128
+L(end_less_128):
+    bltu            a2, t2, L(end_less_64)
+    addi.d          a2, a2, -64
+    xvst            xr0, a3, 0
+
+    xvst            xr0, a3, 32
+    addi.d          a3, a3, 64
+L(end_less_64):
+    bltu            a2, t1, L(end_less_32)
+    xvst            xr0, a3, 0
+
+L(end_less_32):
+    xvst            xr0, a4, -32
+    jr              ra
+END(MEMSET)
+
+libc_hidden_builtin_def (MEMSET)
+#endif
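
For mid-sized buffers this version avoids branching on the exact
length by using overlapping stores: one vector store anchored at the
start and one anchored at the end cover everything in between.  A C
sketch of the 32-64 byte case (illustrative only; memcpy of a
prefilled block stands in for xvst):

    #include <string.h>

    /* Handle 32 <= n <= 64 with two 32-byte stores, one at dst and
       one ending at dst + n, as the L(less_64bytes) path does.  */
    static void
    set32_head_tail (unsigned char *dst, size_t n, unsigned char c)
    {
      unsigned char block[32];
      for (int i = 0; i < 32; i++)
        block[i] = c;                      /* xvreplgr2vr.b analogue.  */
      memcpy (dst, block, 32);             /* xvst xr0, a3, 0.  */
      memcpy (dst + n - 32, block, 32);    /* xvst xr0, a4, -32.  */
    }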
diff --git a/sysdeps/loongarch/lp64/multiarch/memset-lsx.S b/sysdeps/loongarch/lp64/multiarch/memset-lsx.S
new file mode 100644
index 0000000000..3d3982aa5a
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/memset-lsx.S
@@ -0,0 +1,135 @@
+/* Optimized memset implementation using LoongArch LSX instructions.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc) && !defined __loongarch_soft_float
+
+# define MEMSET __memset_lsx
+
+LEAF(MEMSET, 6)
+    li.d            t1, 16
+    move            a3, a0
+    vreplgr2vr.b    vr0, a1
+    add.d           a4, a0, a2
+
+    bgeu            t1, a2, L(less_16bytes)
+    li.d            t3, 64
+    li.d            t2, 32
+    bgeu            a2, t3, L(long_bytes)
+
+L(less_64bytes):
+    bgeu            t2, a2, L(less_32bytes)
+    vst             vr0, a3, 0
+    vst             vr0, a3, 16
+    vst             vr0, a4, -32
+
+    vst             vr0, a4, -16
+    jr              ra
+L(less_32bytes):
+    vst             vr0, a3, 0
+    vst             vr0, a4, -16
+
+
+    jr              ra
+L(less_16bytes):
+    srli.d          t0, a2, 3
+    beqz            t0, L(less_8bytes)
+    vstelm.d        vr0, a3, 0, 0
+
+    vstelm.d        vr0, a4, -8, 0
+    jr              ra
+L(less_8bytes):
+    srli.d          t0, a2, 2
+    beqz            t0, L(less_4bytes)
+
+    vstelm.w        vr0, a3, 0, 0
+    vstelm.w        vr0, a4, -4, 0
+    jr              ra
+L(less_4bytes):
+    srli.d          t0, a2, 1
+
+    beqz            t0, L(less_2bytes)
+    vstelm.h        vr0, a3, 0, 0
+    vstelm.h        vr0, a4, -2, 0
+    jr              ra
+
+
+L(less_2bytes):
+    beqz            a2, L(less_1bytes)
+    vstelm.b        vr0, a3, 0, 0
+L(less_1bytes):
+    jr              ra
+L(long_bytes):
+    vst             vr0, a3, 0
+
+    bstrins.d       a3, zero, 3, 0
+    addi.d          a3, a3, 16
+    sub.d           a2, a4, a3
+    andi            t0, a2, 0x7f
+
+    beq             t0, a2, L(long_end)
+    move            a2, t0
+    sub.d           t0, a4, t0
+
+L(loop_128):
+    vst             vr0, a3, 0
+
+    vst             vr0, a3, 16
+    vst             vr0, a3, 32
+    vst             vr0, a3, 48
+    vst             vr0, a3, 64
+
+
+    vst             vr0, a3, 80
+    vst             vr0, a3, 96
+    vst             vr0, a3, 112
+    addi.d          a3, a3, 128
+
+    bne             a3, t0, L(loop_128)
+L(long_end):
+    bltu            a2, t3, L(end_less_64)
+    addi.d          a2, a2, -64
+    vst             vr0, a3, 0
+
+    vst             vr0, a3, 16
+    vst             vr0, a3, 32
+    vst             vr0, a3, 48
+    addi.d          a3, a3, 64
+
+L(end_less_64):
+    bltu            a2, t2, L(end_less_32)
+    addi.d          a2, a2, -32
+    vst             vr0, a3, 0
+    vst             vr0, a3, 16
+
+    addi.d          a3, a3, 32
+L(end_less_32):
+    bltu            a2, t1, L(end_less_16)
+    vst             vr0, a3, 0
+
+L(end_less_16):
+    vst             vr0, a4, -16
+    jr              ra
+END(MEMSET)
+
+libc_hidden_builtin_def (MEMSET)
+#endif
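
The long path shared by the LSX and LASX versions has the same shape:
store one full vector at the (possibly unaligned) head, round the
pointer up to the next vector boundary, run an unrolled aligned loop,
and finish with a store ending exactly at dst + n.  A structural C
sketch with 16-byte vectors (illustrative; block[] stands in for vr0):

    #include <stdint.h>
    #include <string.h>

    static void
    set_long (unsigned char *dst, size_t n, const unsigned char block[16])
    {
      unsigned char *end = dst + n;
      /* Unaligned head store, then round up to a 16-byte boundary
         (the bstrins.d a3, zero, 3, 0 / addi.d pair above).  */
      memcpy (dst, block, 16);
      unsigned char *p
        = (unsigned char *) ((((uintptr_t) dst) & ~(uintptr_t) 15) + 16);
      /* Aligned body; the asm unrolls this 8x per iteration.  */
      while (end - p >= 16)
        {
          memcpy (p, block, 16);
          p += 16;
        }
      /* Tail store ends exactly at dst + n, overlapping the body.  */
      memcpy (end - 16, block, 16);
    }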
diff --git a/sysdeps/loongarch/lp64/multiarch/memset-unaligned.S b/sysdeps/loongarch/lp64/multiarch/memset-unaligned.S
new file mode 100644
index 0000000000..f7d32039df
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/memset-unaligned.S
@@ -0,0 +1,162 @@
+/* Optimized memset unaligned implementation using basic LoongArch instructions.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc)
+
+# define MEMSET_NAME __memset_unaligned
+
+#define ST_128(n)              \
+    st.d        a1, a0, n;     \
+    st.d        a1, a0, n+8  ; \
+    st.d        a1, a0, n+16 ; \
+    st.d        a1, a0, n+24 ; \
+    st.d        a1, a0, n+32 ; \
+    st.d        a1, a0, n+40 ; \
+    st.d        a1, a0, n+48 ; \
+    st.d        a1, a0, n+56 ; \
+    st.d        a1, a0, n+64 ; \
+    st.d        a1, a0, n+72 ; \
+    st.d        a1, a0, n+80 ; \
+    st.d        a1, a0, n+88 ; \
+    st.d        a1, a0, n+96 ; \
+    st.d        a1, a0, n+104; \
+    st.d        a1, a0, n+112; \
+    st.d        a1, a0, n+120;
+
+LEAF(MEMSET_NAME, 6)
+    bstrins.d   a1, a1, 15, 8
+    add.d       t7, a0, a2
+    bstrins.d   a1, a1, 31, 16
+    move        t0, a0
+
+    bstrins.d   a1, a1, 63, 32
+    srai.d      t8, a2, 4
+    beqz        t8, L(less_16bytes)
+    srai.d      t8, a2, 6
+
+    bnez        t8, L(more_64bytes)
+    srai.d      t8, a2, 5
+    beqz        t8, L(less_32bytes)
+
+    st.d        a1, a0, 0
+    st.d        a1, a0, 8
+    st.d        a1, a0, 16
+    st.d        a1, a0, 24
+
+    st.d        a1, t7, -32
+    st.d        a1, t7, -24
+    st.d        a1, t7, -16
+    st.d        a1, t7, -8
+
+    jr          ra
+
+L(less_32bytes):
+    st.d        a1, a0, 0
+    st.d        a1, a0, 8
+    st.d        a1, t7, -16
+    st.d        a1, t7, -8
+
+    jr          ra
+
+L(less_16bytes):
+    srai.d      t8, a2, 3
+    beqz        t8, L(less_8bytes)
+    st.d        a1, a0, 0
+    st.d        a1, t7, -8
+
+    jr          ra
+
+L(less_8bytes):
+    srai.d      t8, a2, 2
+    beqz        t8, L(less_4bytes)
+    st.w        a1, a0, 0
+    st.w        a1, t7, -4
+
+    jr          ra
+
+L(less_4bytes):
+    srai.d      t8, a2, 1
+    beqz        t8, L(less_2bytes)
+    st.h        a1, a0, 0
+    st.h        a1, t7, -2
+
+    jr          ra
+
+L(less_2bytes):
+    beqz        a2, L(less_1bytes)
+    st.b        a1, a0, 0
+
+    jr          ra
+
+L(less_1bytes):
+    jr          ra
+
+L(more_64bytes):
+    srli.d      a0, a0, 3
+    slli.d      a0, a0, 3
+    addi.d      a0, a0, 0x8
+    st.d        a1, t0, 0
+
+    sub.d       t2, t0, a0
+    add.d       a2, t2, a2
+    addi.d      a2, a2, -0x80
+    blt         a2, zero, L(end_unalign_proc)
+
+L(loop_less):
+    ST_128(0)
+    addi.d      a0, a0,  0x80
+    addi.d      a2, a2, -0x80
+    bge         a2, zero, L(loop_less)
+
+L(end_unalign_proc):
+    addi.d      a2, a2, 0x80
+    pcaddi      t1, 20
+    andi        t5, a2, 0x78
+    srli.d      t5, t5, 1
+
+    sub.d       t1, t1, t5
+    jr          t1
+
+    st.d        a1, a0, 112
+    st.d        a1, a0, 104
+    st.d        a1, a0, 96
+    st.d        a1, a0, 88
+    st.d        a1, a0, 80
+    st.d        a1, a0, 72
+    st.d        a1, a0, 64
+    st.d        a1, a0, 56
+    st.d        a1, a0, 48
+    st.d        a1, a0, 40
+    st.d        a1, a0, 32
+    st.d        a1, a0, 24
+    st.d        a1, a0, 16
+    st.d        a1, a0, 8
+    st.d        a1, a0, 0
+    st.d        a1, t7, -8
+
+    move        a0, t0
+    jr          ra
+END(MEMSET_NAME)
+
+libc_hidden_builtin_def (MEMSET_NAME)
+#endif
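
After the ST_128 loop, the code jumps backwards into the trailing
st.d chain to store the remaining whole 8-byte words, and one final
unaligned st.d ending at the buffer end (st.d a1, t7, -8) mops up the
last 1-7 bytes.  A compact C sketch of that tail handling (the asm
replaces the loop with a pcaddi computed jump into the unrolled
chain):

    #include <stdint.h>
    #include <string.h>

    /* rem is the byte count left after the 128-byte loop
       (0 <= rem < 128), w the replicated fill word, end one past the
       last byte of the buffer.  */
    static void
    tail_store (unsigned char *p, unsigned char *end, size_t rem,
                uint64_t w)
    {
      for (size_t i = 0; i < rem / 8; i++)  /* The st.d chain.  */
        memcpy (p + 8 * i, &w, 8);
      memcpy (end - 8, &w, 8);  /* Covers the ragged 1-7 byte tail.  */
    }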
diff --git a/sysdeps/loongarch/lp64/multiarch/memset.c b/sysdeps/loongarch/lp64/multiarch/memset.c
new file mode 100644
index 0000000000..3ff60d8ac7
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/memset.c
@@ -0,0 +1,37 @@
+/* Multiple versions of memset.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+# define memset __redirect_memset
+# include <string.h>
+# undef memset
+
+# define SYMBOL_NAME memset
+# include "ifunc-lasx.h"
+
+libc_ifunc_redirected (__redirect_memset, memset,
+		       IFUNC_SELECTOR ());
+
+# ifdef SHARED
+__hidden_ver1 (memset, __GI_memset, __redirect_memset)
+  __attribute__ ((visibility ("hidden"))) __attribute_copy__ (memset);
+# endif
+
+#endif
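
The selector itself comes from ifunc-lasx.h, which this patch does not
modify; judging from the SUPPORT_* predicates registered in
ifunc-impl-list.c above, it plausibly has the following shape (an
assumption for illustration, not the verbatim header):

    #if !defined __loongarch_soft_float
    extern __typeof (REDIRECT_NAME) OPTIMIZE (lasx) attribute_hidden;
    extern __typeof (REDIRECT_NAME) OPTIMIZE (lsx) attribute_hidden;
    #endif
    extern __typeof (REDIRECT_NAME) OPTIMIZE (unaligned) attribute_hidden;
    extern __typeof (REDIRECT_NAME) OPTIMIZE (aligned) attribute_hidden;

    static inline void *
    IFUNC_SELECTOR (void)
    {
    #if !defined __loongarch_soft_float
      if (SUPPORT_LASX)
        return OPTIMIZE (lasx);
      if (SUPPORT_LSX)
        return OPTIMIZE (lsx);
    #endif
      if (SUPPORT_UAL)
        return OPTIMIZE (unaligned);
      return OPTIMIZE (aligned);
    }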
