public inbox for libc-alpha@sourceware.org
 help / color / mirror / Atom feed
From: caiyinyu <caiyinyu@loongson.cn>
To: adhemerval.zanella@linaro.org, libc-alpha@sourceware.org,
	i.swmail@xen0n.name
Cc: joseph_myers@mentor.com, carlos@redhat.com,
	xuchenghua@loongson.cn, caiyinyu <caiyinyu@loongson.cn>
Subject: [PATCH 1/2] LoongArch: Add optimized string functions: str{chr, chrnul, cmp, ncmp}.
Date: Mon, 15 Aug 2022 16:57:17 +0800	[thread overview]
Message-ID: <20220815085718.4110353-2-caiyinyu@loongson.cn> (raw)
In-Reply-To: <20220815085718.4110353-1-caiyinyu@loongson.cn>

---
 sysdeps/loongarch/lp64/strchr.S    | 145 +++++++++++++++
 sysdeps/loongarch/lp64/strchrnul.S | 160 ++++++++++++++++
 sysdeps/loongarch/lp64/strcmp.S    | 210 +++++++++++++++++++++
 sysdeps/loongarch/lp64/strncmp.S   | 281 +++++++++++++++++++++++++++++
 4 files changed, 796 insertions(+)
 create mode 100644 sysdeps/loongarch/lp64/strchr.S
 create mode 100644 sysdeps/loongarch/lp64/strchrnul.S
 create mode 100644 sysdeps/loongarch/lp64/strcmp.S
 create mode 100644 sysdeps/loongarch/lp64/strncmp.S

diff --git a/sysdeps/loongarch/lp64/strchr.S b/sysdeps/loongarch/lp64/strchr.S
new file mode 100644
index 0000000000..ffe3fbca62
--- /dev/null
+++ b/sysdeps/loongarch/lp64/strchr.S
@@ -0,0 +1,145 @@
+/* Assembly implementation of strchr.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/*
+ * ISA: LoongArch64
+ * Data Model: lp64
+ */
+
+/* basic algorithm :
+
+   +.   use ld.d and mask for the first 8 bytes or less;
+
+   +.   replicate c into all 8 bytes of a1 with bstrins.d;
+
+   +.   use xor from a1 and v0 to check if is found;
+
+   +.   if ((v0 - 0x0101010101010101) & ~(v0 | 0x7f7f7f7f7f7f7f7f)) != 0, v0
+   has a byte that is \0, else it has no \0
+ */
+
+#include <sysdep.h>
+#include <sys/asm.h>
+
+#define L_ADDIU	addi.d	/* 64-bit add of an immediate */
+#define L_ADDU	add.d	/* 64-bit register add */
+#define L_SUBU	sub.d	/* not referenced in the visible strchr code */
+
+#define STRCHR  strchr
+#define MOVN(rd,rs,rt) \
+	maskeqz		t6, rs, rt;\
+	masknez		rd, rd, rt;\
+	or		rd, rd, t6	/* MOVN: rd = rt ? rs : rd; clobbers t6; not used in the visible strchr code */
+
+#define MOVN2(rd,rt) \
+	masknez		rd, rd, rt;\
+	or		rd, rd, rt	/* MOVN2: rd = rt ? rt : rd; not used in the visible strchr code */
+
+
+/* char * strchr (const char *s1, int c); */
+
+LEAF(STRCHR)	/* in: a0 = s, a1 = c; out: v0 = address of first c, or 0 if \0 comes first */
+	.align		6	/* NOTE(review): follows LEAF(); if LEAF emits the entry label the padding lands after it - confirm */
+
+	li.w		t4, 0x7	/* t4 = mask of the low three address bits */
+	lu12i.w		a2, 0x01010	/* a2 = 0x01010000 */
+	bstrins.d	a1, a1, 15, 8	/* copy c from bits 7:0 into bits 15:8 */
+	andi		t0, a0, 0x7	/* t0 = byte offset of s inside its doubleword */
+
+	ori		a2, a2, 0x101	/* a2 = 0x01010101 */
+	andn		t4, a0, t4	/* t4 = s rounded down to 8 bytes */
+	slli.w		t1, t0, 3	/* t1 = that offset in bits */
+
+	ld.d		t4, t4, 0	/* aligned load of the doubleword containing s */
+
+	nor		t8, zero, zero	/* t8 = all ones */
+	bstrins.d	a1, a1, 31, 16	/* c now fills bits 31:0 */
+	srl.d		t4, t4, t1	/* shift out the bytes that precede s */
+
+	bstrins.d	a1, a1, 63, 32	/* a1 = c in every byte */
+	bstrins.d	a2, a2, 63, 32	/* a2 = 0x0101010101010101 */
+	srl.d		a7, t8, t1	/* a7 = ones over the bytes that came from s onward */
+
+	li.w		t1, 8
+	nor		t8, a7, zero	/* t8 = ~a7, the vacated high bytes */
+	slli.d		a3, a2, 7	/* a3 = 0x8080808080808080 */
+	or		t5, t8, t4	/* t5 = data with the vacated bytes forced to 0xff */
+	and		t3, a7, a1	/* t3 = c pattern restricted to the valid bytes */
+
+	sub.w		t1, t1, t0	/* t1 = 8 - offset = bytes up to the next boundary */
+	nor		a3, a3, zero	/* a3 = 0x7f7f7f7f7f7f7f7f */
+	xor		t2, t5, t3	/* t2 has a zero byte wherever data == c */
+	sub.d		a7, t5, a2	/* a7 = data - 0x01.. */
+	nor		a6, t5, a3	/* a6 = ~(data | 0x7f..) */
+
+	sub.d		a5, t2, a2	/* same zero-byte test applied to t2 */
+	nor		a4, t2, a3
+
+	and		a6, a7, a6	/* a6 != 0 iff data holds a \0 byte */
+	and		a5, a5, a4	/* a5 != 0 iff data holds a byte equal to c */
+	or		a7, a6, a5
+	bnez		a7, L(_mc8_a)	/* \0 or c found in the first doubleword */
+
+	L_ADDU		a0, a0, t1	/* advance s to the next 8-byte boundary */
+L(_aloop):	/* aligned loop, two doublewords per iteration */
+	ld.d		t4, a0, 0	/* first doubleword */
+
+	xor		t2, t4, a1	/* zero byte where data == c */
+	sub.d		a7, t4, a2
+	nor		a6, t4, a3
+	sub.d		a5, t2, a2
+
+	nor		a4, t2, a3
+	and		a6, a7, a6	/* \0 indicator */
+	and		a5, a5, a4	/* c indicator */
+	or		a7, a6, a5
+	bnez		a7, L(_mc8_a)	/* hit in the first doubleword, a0 still points at it */
+
+	ld.d		t4, a0, 8	/* second doubleword */
+	L_ADDIU	 a0, a0, 16	/* a0 += 16 */
+	xor		t2, t4, a1
+	sub.d		a7, t4, a2
+	nor		a6, t4, a3
+	sub.d		a5, t2, a2
+
+	nor		a4, t2, a3
+	and		a6, a7, a6	/* \0 indicator */
+	and		a5, a5, a4	/* c indicator */
+	or		a7, a6, a5
+	beqz		a7, L(_aloop)	/* nothing found, keep scanning */
+
+	L_ADDIU		a0, a0, -8	/* hit was in the second doubleword: point a0 back at it */
+L(_mc8_a):	/* here a0 addresses byte 0 of the data that produced a5/a6 */
+
+	ctz.d		t0, a5	/* bit position of the first c indicator */
+	ctz.d		t2, a6	/* bit position of the first \0 indicator */
+
+	srli.w		t0, t0, 3	/* t0 = byte index of the first c */
+	srli.w		t2, t2, 3	/* t2 = byte index of the first \0 */
+	sltu		t1, t2, t0	/* t1 = 1 when \0 precedes the first c */
+	L_ADDU		v0, a0, t0	/* candidate result: address of the first c */
+	masknez		v0, v0, t1	/* result = 0 when \0 came first */
+	jr		ra
+END(STRCHR)
+
+#ifndef ANDROID_CHANGES
+#ifdef _LIBC
+libc_hidden_builtin_def (strchr)
+weak_alias (strchr, index)
+#endif
+#endif
diff --git a/sysdeps/loongarch/lp64/strchrnul.S b/sysdeps/loongarch/lp64/strchrnul.S
new file mode 100644
index 0000000000..dcbfded765
--- /dev/null
+++ b/sysdeps/loongarch/lp64/strchrnul.S
@@ -0,0 +1,160 @@
+/* Assembly implementation of strchrnul.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/*
+ * ISA: LoongArch64
+ * Data Model: lp64
+ */
+
+/* basic algorithm :
+
+    +.  use ld.d and mask for the first 8 bytes or less;
+
+    +.  replicate c into all 8 bytes of a1 with bstrins.d;
+
+    +.  use xor from a1 and v0 to check if is found;
+
+    +.  if ((v0 - 0x0101010101010101) & ~(v0 | 0x7f7f7f7f7f7f7f7f)) != 0, v0
+    has a byte that is \0, else it has no \0
+
+ */
+
+#include <sysdep.h>
+#include <sys/asm.h>
+
+
+#define L_ADDIU	addi.d	/* 64-bit add of an immediate */
+#define L_ADDU	add.d	/* 64-bit register add */
+#define L_SUBU	sub.d	/* not referenced in the visible strchrnul code */
+
+#define STRCHRNUL   __strchrnul
+
+#define MOVN(rd,rs,rt) \
+	maskeqz		t6, rs, rt;\
+	masknez		rd, rd, rt;\
+	or		rd, rd, t6	/* MOVN: rd = rt ? rs : rd; clobbers t6; not used in the visible strchrnul code */
+
+#define MOVZ(rd,rs,rt) \
+	masknez		t6, rs, rt;\
+	maskeqz		rd, rd, rt;\
+	or		rd, rd, t6	/* MOVZ: rd = rt ? rd : rs; clobbers t6 */
+
+
+#define MOVN2(rd,rt) \
+	masknez		rd, rd, rt;\
+	or		rd, rd, rt	/* MOVN2: rd = rt ? rt : rd; not used in the visible strchrnul code */
+
+/* char * strchrnul (const char *s1, int c); */
+
+LEAF(STRCHRNUL)	/* in: a0 = s, a1 = c; out: v0 = address of first c or of the terminating \0 */
+	.align		6	/* NOTE(review): follows LEAF(); if LEAF emits the entry label the padding lands after it - confirm */
+
+	li.w		t4, 0x7	/* t4 = mask of the low three address bits */
+	lu12i.w		a2, 0x01010	/* a2 = 0x01010000 */
+	bstrins.d	a1, a1, 15, 8	/* copy c from bits 7:0 into bits 15:8 */
+	andi		t0, a0, 0x7	/* t0 = byte offset of s inside its doubleword */
+
+	ori		a2, a2, 0x101	/* a2 = 0x01010101 */
+	andn		t4, a0, t4	/* t4 = s rounded down to 8 bytes */
+	slli.w		t1, t0, 3	/* t1 = that offset in bits */
+
+	ld.d		t4, t4, 0	/* aligned load of the doubleword containing s */
+
+	nor		t8, zero, zero	/* t8 = all ones */
+	bstrins.d	a1, a1, 31, 16	/* c now fills bits 31:0 */
+	srl.d		t4, t4, t1	/* shift out the bytes that precede s */
+
+	preld		0,  a0, 32	/* prefetch hint for address s + 32 */
+	bstrins.d	a1, a1, 63, 32	/* a1 = c in every byte */
+	bstrins.d	a2, a2, 63, 32	/* a2 = 0x0101010101010101 */
+	srl.d		a7, t8, t1	/* a7 = ones over the bytes that came from s onward */
+
+	nor		t8, a7, zero	/* t8 = ~a7, the vacated high bytes */
+	slli.d		a3, a2, 7	/* a3 = 0x8080808080808080 */
+	or		t5, t8, t4	/* t5 = data with the vacated bytes forced to 0xff */
+	and		t3, a7, a1	/* t3 = c pattern restricted to the valid bytes */
+
+	nor		a3, a3, zero	/* a3 = 0x7f7f7f7f7f7f7f7f */
+	xor		t2, t5, t3	/* t2 has a zero byte wherever data == c */
+	sub.d		a7, t5, a2	/* a7 = data - 0x01.. */
+	nor		a6, t5, a3	/* a6 = ~(data | 0x7f..) */
+
+	li.w		t1, 8
+	sub.d		a5, t2, a2	/* same zero-byte test applied to t2 */
+	nor		a4, t2, a3
+
+	and		a6, a7, a6	/* a6 != 0 iff data holds a \0 byte */
+	and		a5, a5, a4	/* a5 != 0 iff data holds a byte equal to c */
+	or		a7, a6, a5
+	bnez		a7, L(_mc8_a)	/* \0 or c found in the first doubleword */
+
+
+	sub.w		t1, t1, t0	/* t1 = 8 - offset = bytes up to the next boundary */
+	L_ADDU		a0, a0, t1	/* advance s to the next 8-byte boundary */
+L(_aloop):	/* aligned loop, two doublewords per iteration */
+	ld.d		t4, a0, 0	/* first doubleword */
+
+	xor		t2, t4, a1	/* zero byte where data == c */
+	sub.d		a7, t4, a2
+	nor		a6, t4, a3
+	sub.d		a5, t2, a2
+
+	nor		a4, t2, a3
+	and		a6, a7, a6	/* \0 indicator */
+	and		a5, a5, a4	/* c indicator */
+
+	or		a7, a6, a5
+	bnez		a7, L(_mc8_a)	/* hit in the first doubleword, a0 still points at it */
+
+	ld.d		t4, a0, 8	/* second doubleword */
+	L_ADDIU		a0, a0, 16	/* a0 += 16 */
+
+	xor		t2, t4, a1
+	sub.d		a7, t4, a2
+	nor		a6, t4, a3
+	sub.d		a5, t2, a2
+
+	nor		a4, t2, a3
+	and		a6, a7, a6	/* \0 indicator */
+	and		a5, a5, a4	/* c indicator */
+
+	or		a7, a6, a5
+	beqz		a7, L(_aloop)	/* nothing found, keep scanning */
+
+	L_ADDIU		a0, a0, -8	/* hit was in the second doubleword: point a0 back at it */
+L(_mc8_a):	/* here a0 addresses byte 0 of the data that produced a5/a6 */
+
+	ctz.d		t0, a5	/* bit position of the first c indicator */
+	ctz.d		t2, a6	/* bit position of the first \0 indicator */
+
+	srli.w		t0, t0, 3	/* t0 = byte index of the first c */
+	srli.w		t2, t2, 3	/* t2 = byte index of the first \0 */
+	slt		t1, t0, t2	/* t1 = 1 when c comes before \0 */
+
+	MOVZ(t0,t2,t1)	/* t0 = t1 ? t0 : t2, i.e. the smaller index; clobbers t6 */
+
+	L_ADDU		v0, a0, t0	/* result = address of whichever came first */
+	jr		ra
+END(STRCHRNUL)
+
+#ifndef ANDROID_CHANGES
+#ifdef _LIBC
+weak_alias(__strchrnul, strchrnul)
+libc_hidden_builtin_def (__strchrnul)
+#endif
+#endif
diff --git a/sysdeps/loongarch/lp64/strcmp.S b/sysdeps/loongarch/lp64/strcmp.S
new file mode 100644
index 0000000000..8160ae432d
--- /dev/null
+++ b/sysdeps/loongarch/lp64/strcmp.S
@@ -0,0 +1,210 @@
+/* Assembly implementation of strcmp.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/*
+ * ISA: LoongArch64
+ * Data Model: lp64
+ */
+
+/* basic algorithm :
+
+   +.  let t0, t1 point to a0, a1, if a0 has smaller low 3 bit of a0 and a1,
+   set a4 to 1 and let t0 point to the larger of lower 3bit of a0 and a1
+
+   +.  if low 3 bits of a0 equal low 3 bits of a1, use a ldr one time and more
+   ld other times;
+
+   +.  if not,  load partial t2 and t3, check if t2 has \0;
+
+   +.  then use ld for t0, ldr for t1,
+
+   +.  if partial 8 byte  from t1 has \0, compare partial 8 byte from t1 with
+   8 byte from t0 with a mask in a7
+
+   +.  if not, ldl other part of t1, compare  8 byte from t1 with 8 byte from
+   t0
+
+   +.  if (v0 - 0x0101010101010101) & (~v0) & 0x8080808080808080 != 0, v0 has
+   one byte is \0, else has no \0
+
+   +.  for partial 8 byte from ldr t3, 0(a0), preload t3 with
+   0xffffffffffffffff
+ */
+
+#include <sys/asm.h>
+#include <sys/regdef.h>
+
+
+#define STRCMP  strcmp
+
+#define REP8_01 0x0101010101010101	/* not referenced in the visible code; zeroones is built with lu12i.w/ori/bstrins.d */
+#define REP8_7f 0x7f7f7f7f7f7f7f7f	/* not referenced in the visible code; sevenf is built the same way */
+#define REP8_80 0x8080808080808080	/* not referenced in the visible code */
+
+/* Parameters and Results */
+#define src1	a0
+#define src2	a1
+#define result	v0
+/* Note: v0 = a0 in lp64 ABI */
+
+
+/* Internal variable */
+#define data1		t0	/* bytes loaded from src1 */
+#define data2		t1	/* bytes loaded from src2 */
+#define has_nul		t2	/* nonzero when data1 contains a \0 byte */
+#define diff		t3	/* data1 ^ data2 */
+#define syndrome	t4	/* diff | has_nul */
+#define zeroones	t5	/* 0x0101010101010101 */
+#define sevenf		t6	/* 0x7f7f7f7f7f7f7f7f */
+#define pos		t7	/* bit offset of the deciding byte */
+#define exchange	t8	/* nonzero when src1 and src2 were swapped */
+#define tmp1		a4
+#define tmp2		a5
+#define tmp3		a6	/* also the scratch register of CONDITIONSEL */
+#define src1_off	a2	/* src1 & 7 */
+#define src2_off	a3	/* src2 & 7 */
+#define tmp4		a7	/* not referenced in the visible strcmp code */
+
+/* rd <- if rc then ra else rb will destroy tmp3 */
+
+#define CONDITIONSEL(rd,rc,ra,rb)\
+	masknez		tmp3, rb, rc;\
+	maskeqz		rd,   ra, rc;\
+	or		rd,   rd, tmp3	/* rd = rc ? ra : rb, using tmp3 as scratch */
+
+
+
+/* int strcmp (const char *s1, const char *s2); */
+
+LEAF(STRCMP)	/* in: a0 = s1, a1 = s2; out: v0 = difference of the first deciding bytes */
+	.align		4	/* NOTE(review): follows LEAF(); if LEAF emits the entry label the padding lands after it - confirm */
+
+	xor		tmp1, src1, src2
+	lu12i.w		zeroones, 0x01010	/* zeroones = 0x01010000 */
+	lu12i.w		sevenf, 0x7f7f7	/* sevenf = 0x7f7f7000 */
+	andi		src1_off, src1, 0x7	/* src1_off = src1 & 7 */
+	ori		zeroones, zeroones, 0x101	/* zeroones = 0x01010101 */
+	ori		sevenf, sevenf, 0xf7f	/* sevenf = 0x7f7f7f7f */
+	andi		tmp1, tmp1, 0x7	/* nonzero when the two pointers differ in alignment */
+	bstrins.d	zeroones, zeroones, 63, 32	/* replicate into the upper word */
+	bstrins.d	sevenf, sevenf, 63, 32	/* replicate into the upper word */
+	bnez		tmp1, strcmp_misaligned8	/* different alignment */
+	bnez		src1_off, strcmp_mutual_align	/* same alignment, but not 8-byte aligned */
+strcmp_loop_aligned:	/* both pointers 8-byte aligned */
+	ld.d		data1, src1, 0
+	addi.d		src1, src1, 8
+	ld.d		data2, src2, 0
+	addi.d		src2, src2, 8
+strcmp_start_realigned:
+	sub.d		tmp1, data1, zeroones	/* data1 - 0x01.. */
+	or		tmp2, data1, sevenf	/* data1 | 0x7f.. */
+	xor		diff, data1, data2
+	andn		has_nul, tmp1, tmp2	/* nonzero iff data1 holds a \0 byte */
+	or		syndrome, diff, has_nul
+	beqz		syndrome, strcmp_loop_aligned	/* equal and no \0: next doubleword */
+
+strcmp_end:
+	ctz.d		pos, syndrome	/* lowest set bit marks the deciding byte */
+	bstrins.d	pos, zero, 2, 0	/* round down to a multiple of 8 bits */
+	srl.d		data1, data1, pos	/* move the deciding byte to bits 7:0 */
+	srl.d		data2, data2, pos
+	andi		data1, data1, 0xff
+	andi		data2, data2, 0xff
+	sub.d		result, data1, data2	/* byte of s1 minus byte of s2 */
+	jr		ra
+strcmp_mutual_align:	/* same misalignment: start from the enclosing aligned doublewords */
+	bstrins.d	src1, zero, 2, 0	/* round src1 down to 8 bytes */
+	bstrins.d	src2, zero, 2, 0	/* round src2 down to 8 bytes */
+	slli.d		tmp1, src1_off,  0x3	/* tmp1 = offset in bits */
+	ld.d		data1, src1, 0
+	sub.d		tmp1, zero, tmp1	/* tmp1 = -(offset in bits), used as a shift count */
+	ld.d		data2, src2, 0
+	addi.d		src1, src1, 8
+	addi.d		src2, src2, 8
+	nor		tmp2, zero, zero	/* tmp2 = all ones */
+	srl.d		tmp2, tmp2, tmp1	/* tmp2 = ones over the bytes before the string start */
+	or		data1, data1, tmp2	/* force those bytes to 0xff in both words so they */
+	or		data2, data2, tmp2	/* compare equal and are never seen as \0 */
+	b		strcmp_start_realigned
+
+strcmp_misaligned8:
+
+/* if ((src1_off != 0) && ((src2_off == 0) || (src1_off < src2_off)))
+   then exchange(src1,src2)
+*/
+	andi		src2_off, src2, 0x7	/* src2_off = src2 & 7 */
+	slt		tmp2, src1_off, src2_off	/* tmp2 = src1_off < src2_off */
+	CONDITIONSEL(tmp2,src2_off,tmp2,tmp1)	/* tmp2 = src2_off ? tmp2 : tmp1 (tmp1 is nonzero here) */
+	maskeqz		exchange, tmp2, src1_off	/* exchange = src1_off ? tmp2 : 0 */
+	xor		tmp3, src1, src2
+	maskeqz		tmp3, tmp3, exchange	/* tmp3 = exchange ? src1 ^ src2 : 0 */
+	xor		src1, src1, tmp3	/* swap the pointers when exchange is set */
+	xor		src2, src2, tmp3
+
+	andi		src1_off, src1, 0x7	/* recompute after the possible swap */
+	beqz		src1_off, strcmp_loop_misaligned	/* src1 already aligned */
+strcmp_do_misaligned:	/* byte loop until src1 is 8-byte aligned */
+	ld.bu		data1, src1, 0
+	ld.bu		data2, src2, 0
+	xor		tmp3, data1, data2
+	addi.d		src1, src1, 1
+	masknez		tmp3, data1, tmp3	/* tmp3 = bytes differ ? 0 : data1 */
+	addi.d		src2, src2, 1
+	beqz		tmp3, strcmp_done	/* stop on a difference or on \0 */
+	andi		src1_off, src1, 0x7
+	bnez		src1_off, strcmp_do_misaligned
+
+strcmp_loop_misaligned:	/* src1 aligned, src2 not */
+	andi		tmp1, src2, 0xff8
+	xori		tmp1, tmp1, 0xff8	/* zero iff the low 12 bits of src2 are >= 0xff8 */
+	beqz		tmp1, strcmp_do_misaligned	/* presumably avoids an 8-byte load of src2 crossing a 4 KiB boundary - confirm */
+	ld.d		data1, src1, 0
+	ld.d		data2, src2, 0	/* unaligned load */
+	addi.d		src1, src1, 8
+	addi.d		src2, src2, 8
+
+	sub.d		tmp1, data1, zeroones
+	or		tmp2, data1, sevenf
+	xor		diff, data1, data2
+	andn		has_nul, tmp1, tmp2	/* nonzero iff data1 holds a \0 byte */
+	or		syndrome, diff, has_nul
+	beqz		syndrome, strcmp_loop_misaligned
+
+strcmp_misalign_end:
+	ctz.d		pos, syndrome	/* lowest set bit marks the deciding byte */
+	bstrins.d	pos, zero, 2, 0	/* round down to a multiple of 8 bits */
+	srl.d		data1, data1, pos
+	srl.d		data2, data2, pos
+	andi		data1, data1, 0xff
+	andi		data2, data2, 0xff
+	sub.d		tmp1, data1, data2
+	sub.d		tmp2, data2, data1
+	CONDITIONSEL(result,exchange,tmp2,tmp1)	/* negate the result when the pointers were swapped */
+	jr		ra
+
+strcmp_done:	/* data1/data2 hold the single deciding bytes */
+	sub.d		tmp1, data1, data2
+	sub.d		tmp2, data2, data1
+	CONDITIONSEL(result,exchange,tmp2,tmp1)	/* negate the result when the pointers were swapped */
+	jr		ra
+END(STRCMP)
+#ifndef ANDROID_CHANGES
+#ifdef _LIBC
+libc_hidden_builtin_def (strcmp)
+#endif
+#endif
diff --git a/sysdeps/loongarch/lp64/strncmp.S b/sysdeps/loongarch/lp64/strncmp.S
new file mode 100644
index 0000000000..a72b280170
--- /dev/null
+++ b/sysdeps/loongarch/lp64/strncmp.S
@@ -0,0 +1,281 @@
+/* Assembly implementation of strncmp.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/*
+ * ISA: LoongArch64
+ * Data Model: lp64
+ */
+
+/* basic algorithm :
+
+   +.   let t0, t1 point to a0, a1, if a0 has smaller low 3 bit of a0 and a1,
+   set a4 to 1 and let t0 point to the larger of lower 3bit of a0 and a1
+
+   +.   if low 3 bits of a0 equal low 3 bits of a1, use a ldr one time and more
+   ld other times;
+
+   +.   if not,  load partial t2 and t3, check if t2 has \0;
+
+   +.   then use ld for t0, ldr for t1,
+
+   +.   if partial 8 byte  from t1 has \0, compare partial 8 byte from t1 with
+   8 byte from t0 with a mask in a7
+
+   +.   if not, ldl other part of t1, compare  8 byte from t1 with 8 byte from
+   t0
+
+   +.   if (v0 - 0x0101010101010101) & (~v0) & 0x8080808080808080 != 0, v0 has
+   one byte is \0, else has no \0
+
+   +.   for partial 8 byte from ldr t3, 0(a0), preload t3 with
+   0xffffffffffffffff
+
+ */
+
+#include <sys/asm.h>
+#include <sys/regdef.h>
+
+
+#define STRNCMP strncmp
+
+#define REP8_01 0x0101010101010101	/* not referenced in the visible code; zeroones is built with lu12i.w/ori/bstrins.d */
+#define REP8_7f 0x7f7f7f7f7f7f7f7f	/* not referenced in the visible code; sevenf is built the same way */
+#define REP8_80 0x8080808080808080	/* not referenced in the visible code */
+
+/* Parameters and Results */
+#define src1	a0
+#define src2	a1
+#define limit	a2	/* maximum number of bytes to compare */
+/* Note: v0 = a0 in lp64 ABI */
+#define result	v0
+
+
+/* Internal variable */
+#define data1		t0	/* bytes loaded from src1 */
+#define data2		t1	/* bytes loaded from src2 */
+#define has_nul		t2	/* nonzero when data1 contains a \0 byte */
+#define diff		t3	/* data1 ^ data2 */
+#define syndrome	t4	/* diff | has_nul */
+#define zeroones	t5	/* 0x0101010101010101 */
+#define sevenf		t6	/* 0x7f7f7f7f7f7f7f7f */
+#define pos		t7	/* bit offset of the deciding byte */
+#define exchange	t8	/* not referenced in the visible strncmp code */
+#define tmp1		a5
+#define tmp2		a6
+#define tmp3		a7
+#define src1_off	a3	/* src1 & 7 */
+#define limit_wd	a4	/* doubleword countdown derived from limit */
+
+
+/* int strncmp (const char *s1, const char *s2, size_t n); */
+
+LEAF(STRNCMP)	/* in: a0 = s1, a1 = s2, a2 = limit; out: v0 = difference of the first deciding bytes, or 0 */
+	.align		4	/* NOTE(review): follows LEAF(); if LEAF emits the entry label the padding lands after it - confirm */
+	beqz		limit, strncmp_ret0	/* nothing to compare */
+
+	xor		tmp1, src1, src2
+	lu12i.w		zeroones, 0x01010	/* zeroones = 0x01010000 */
+	lu12i.w		sevenf, 0x7f7f7	/* sevenf = 0x7f7f7000 */
+	andi		src1_off, src1, 0x7	/* src1_off = src1 & 7 */
+	ori		zeroones, zeroones, 0x101	/* zeroones = 0x01010101 */
+	andi		tmp1, tmp1, 0x7	/* nonzero when the two pointers differ in alignment */
+	ori		sevenf, sevenf, 0xf7f	/* sevenf = 0x7f7f7f7f */
+	bstrins.d	zeroones, zeroones, 63, 32	/* replicate into the upper word */
+	bstrins.d	sevenf, sevenf, 63, 32	/* replicate into the upper word */
+	bnez		tmp1, strncmp_misaligned8	/* different alignment */
+	bnez		src1_off, strncmp_mutual_align	/* same alignment, but not 8-byte aligned */
+
+	addi.d		limit_wd, limit, -1
+	srli.d		limit_wd, limit_wd, 3	/* limit_wd = (limit - 1) / 8 */
+
+strncmp_loop_aligned:	/* both pointers 8-byte aligned */
+	ld.d		data1, src1, 0
+	addi.d		src1, src1, 8
+	ld.d		data2, src2, 0
+	addi.d		src2, src2, 8
+
+strncmp_start_realigned:
+	addi.d		limit_wd, limit_wd, -1	/* goes negative on the doubleword that holds the limit */
+	sub.d		tmp1, data1, zeroones
+	or		tmp2, data1, sevenf
+	xor		diff, data1, data2
+	andn		has_nul, tmp1, tmp2	/* nonzero iff data1 holds a \0 byte */
+	srli.d		tmp1, limit_wd, 63	/* tmp1 = sign bit of limit_wd */
+	or		syndrome, diff, has_nul
+	or		tmp2, syndrome, tmp1
+	beqz		tmp2, strncmp_loop_aligned	/* equal, no \0, limit not reached */
+
+	/* if not reach limit */
+	bge		limit_wd, zero, strncmp_not_limit
+	/* if reach limit */
+	andi		limit, limit, 0x7	/* bytes of the last doubleword inside the limit (0 means all 8) */
+	li.w		tmp1, 0x8
+	sub.d		limit, tmp1, limit	/* bytes to discard */
+	slli.d		limit, limit, 0x3	/* ... in bits */
+	li.d		tmp1, -1
+	srl.d		tmp1, tmp1, limit	/* tmp1 = ones over the bytes inside the limit */
+	and		data1, data1, tmp1	/* zero the bytes past the limit */
+	and		data2, data2, tmp1
+	orn		syndrome, syndrome, tmp1	/* flag the bytes past the limit so pos never goes beyond them */
+
+
+strncmp_not_limit:
+	ctz.d		pos, syndrome	/* lowest set bit marks the deciding byte */
+	bstrins.d	pos, zero, 2, 0	/* round down to a multiple of 8 bits */
+	srl.d		data1, data1, pos	/* move the deciding byte to bits 7:0 */
+	srl.d		data2, data2, pos
+	andi		data1, data1, 0xff
+	andi		data2, data2, 0xff
+	sub.d		result, data1, data2	/* byte of s1 minus byte of s2 */
+	jr		ra
+
+
+
+strncmp_mutual_align:	/* same misalignment: start from the enclosing aligned doublewords */
+	bstrins.d	src1, zero, 2, 0	/* round src1 down to 8 bytes */
+	bstrins.d	src2, zero, 2, 0	/* round src2 down to 8 bytes */
+	slli.d		tmp1, src1_off,  0x3	/* tmp1 = offset in bits */
+	ld.d		data1, src1, 0
+	ld.d		data2, src2, 0
+	addi.d		src2, src2, 8
+	addi.d		src1, src1, 8
+
+	addi.d		limit_wd, limit, -1
+	andi		tmp3, limit_wd, 0x7	/* tmp3 = (limit - 1) & 7 */
+	srli.d		limit_wd, limit_wd, 3	/* limit_wd = (limit - 1) / 8 */
+	add.d		limit, limit, src1_off	/* count the bytes before the start in limit */
+	add.d		tmp3, tmp3, src1_off
+	srli.d		tmp3, tmp3, 3	/* tmp3 = carry into the doubleword count */
+	add.d		limit_wd, limit_wd, tmp3
+
+	sub.d		tmp1, zero, tmp1	/* tmp1 = -(offset in bits), used as a shift count */
+	nor		tmp2, zero, zero	/* tmp2 = all ones */
+	srl.d		tmp2, tmp2, tmp1	/* tmp2 = ones over the bytes before the string start */
+	or		data1, data1, tmp2	/* force those bytes to 0xff in both words so they */
+	or		data2, data2, tmp2	/* compare equal and are never seen as \0 */
+	b		strncmp_start_realigned
+
+strncmp_misaligned8:
+	li.w		tmp1, 0x10
+	bge		limit, tmp1, strncmp_try_words	/* NOTE(review): signed compare, a limit >= 2^63 takes the byte loop - confirm intended */
+
+strncmp_byte_loop:	/* two bytes per iteration */
+	ld.bu		data1, src1, 0
+	ld.bu		data2, src2, 0
+	addi.d		limit, limit, -1
+	xor		tmp1, data1, data2
+	masknez		tmp1, data1, tmp1	/* tmp1 = bytes differ ? 0 : data1 */
+	maskeqz		tmp1, limit, tmp1	/* tmp1 = tmp1 ? limit : 0 */
+	beqz		tmp1, strncmp_done	/* stop on a difference, on \0 or when limit hits 0 */
+
+	ld.bu		data1, src1, 1
+	ld.bu		data2, src2, 1
+	addi.d		src1, src1, 2
+	addi.d		src2, src2, 2
+	addi.d		limit, limit, -1
+	xor		tmp1, data1, data2
+	masknez		tmp1, data1, tmp1	/* tmp1 = bytes differ ? 0 : data1 */
+	maskeqz		tmp1, limit, tmp1	/* tmp1 = tmp1 ? limit : 0 */
+	bnez		tmp1, strncmp_byte_loop
+
+
+strncmp_done:	/* data1/data2 hold the single deciding bytes */
+	sub.d		result, data1, data2
+	jr		ra
+
+strncmp_try_words:
+	srli.d		limit_wd, limit, 3	/* limit_wd = limit / 8 */
+	beqz		src1_off, strncmp_do_misaligned	/* src1 already 8-byte aligned */
+
+	sub.d		src1_off, zero, src1_off
+	andi		src1_off, src1_off, 0x7	/* src1_off = bytes until src1 is 8-byte aligned */
+	sub.d		limit, limit, src1_off	/* account for them up front */
+	srli.d		limit_wd, limit, 0x3	/* limit_wd = remaining limit / 8 */
+
+
+strncmp_page_end_loop:	/* byte loop until src1 is 8-byte aligned; limit is not tested here */
+	ld.bu		data1, src1, 0
+	ld.bu		data2, src2, 0
+	addi.d		src1, src1, 1
+	addi.d		src2, src2, 1
+	xor		tmp1, data1, data2
+	masknez		tmp1, data1, tmp1	/* tmp1 = bytes differ ? 0 : data1 */
+	beqz		tmp1, strncmp_done	/* stop on a difference or on \0 */
+	andi		tmp1, src1, 0x7
+	bnez		tmp1, strncmp_page_end_loop
+strncmp_do_misaligned:
+	li.w		src1_off, 0x8	/* src1_off is not read again in the visible code */
+	addi.d		limit_wd, limit_wd, -1
+	blt		limit_wd, zero, strncmp_done_loop	/* no full doubleword left */
+
+strncmp_loop_misaligned:	/* src1 aligned, src2 not */
+	andi		tmp2, src2, 0xff8
+	xori		tmp2, tmp2, 0xff8	/* zero iff the low 12 bits of src2 are >= 0xff8 */
+	beqz		tmp2, strncmp_page_end_loop	/* compare this doubleword bytewise instead */
+
+	ld.d		data1, src1, 0
+	ld.d		data2, src2, 0	/* unaligned load */
+	addi.d		src1, src1, 8
+	addi.d		src2, src2, 8
+	sub.d		tmp1, data1, zeroones
+	or		tmp2, data1, sevenf
+	xor		diff, data1, data2
+	andn		has_nul, tmp1, tmp2	/* nonzero iff data1 holds a \0 byte */
+	or		syndrome, diff, has_nul
+	bnez		syndrome, strncmp_not_limit	/* difference or \0 inside a full doubleword */
+	addi.d		limit_wd, limit_wd, -1
+	bge		limit_wd, zero, strncmp_loop_misaligned
+
+strncmp_done_loop:
+	andi		limit, limit, 0x7	/* tail bytes after the full doublewords */
+	beqz		limit, strncmp_not_limit	/* no tail bytes left */
+	/* Read the last double word */
+	/* check if the final part is about to exceed the page */
+	andi		tmp1, src2, 0x7
+	andi		tmp2, src2, 0xff8
+	add.d		tmp1, tmp1, limit	/* (src2 & 7) + tail length */
+	xori		tmp2, tmp2, 0xff8	/* zero iff the low 12 bits of src2 are >= 0xff8 */
+	andi		tmp1, tmp1, 0x8	/* set when that sum reaches 8 */
+	masknez		tmp1, tmp1, tmp2	/* keep it only when tmp2 is zero */
+	bnez		tmp1, strncmp_byte_loop	/* compare the tail bytewise instead */
+	addi.d		src1, src1, -8
+	addi.d		src2, src2, -8
+	ldx.d		data1, src1, limit	/* the 8 bytes that end exactly at the limit */
+	ldx.d		data2, src2, limit
+	sub.d		tmp1, data1, zeroones
+	or		tmp2, data1, sevenf
+	xor		diff, data1, data2
+	andn		has_nul, tmp1, tmp2	/* nonzero iff data1 holds a \0 byte */
+	or		syndrome, diff, has_nul
+	bnez		syndrome, strncmp_not_limit
+
+strncmp_ret0:
+	move		result, zero	/* equal within the limit */
+	jr		ra
+
+/* check if ((src1 != 0) && ((src2 == 0) || (src1 < src2)))
+   then exchange(src1,src2)
+ */
+
+
+END(STRNCMP)
+#ifndef ANDROID_CHANGES
+#ifdef _LIBC
+libc_hidden_builtin_def (strncmp)
+#endif
+#endif
-- 
2.31.1


  reply	other threads:[~2022-08-15  8:57 UTC|newest]

Thread overview: 18+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-08-15  8:57 [PATCH 0/2] LoongArch: Add optimized functions caiyinyu
2022-08-15  8:57 ` caiyinyu [this message]
2022-08-15  8:57 ` [PATCH 2/2] LoongArch: Add optimized function: memmove caiyinyu
2022-08-15 14:02 ` [PATCH 0/2] LoongArch: Add optimized functions Carlos O'Donell
2022-08-15 20:46   ` Joseph Myers
     [not found]     ` <ccc3c93d-07d0-ea9b-562c-aeaec8914f20@loongson.cn>
2022-09-02  9:05       ` Fwd: " dengjianbo
2022-09-02 12:27     ` Adhemerval Zanella Netto
     [not found]       ` <403f78f0-55d9-48cf-c62a-4a0462a76987@loongson.cn>
2022-09-19  2:03         ` dengjianbo
2022-09-19 20:16           ` Adhemerval Zanella Netto
2022-09-20  9:54             ` Xi Ruoyao
2022-09-22 18:05               ` Adhemerval Zanella Netto
2022-09-26 13:49                 ` Xi Ruoyao
2022-09-28 14:22                   ` Richard Henderson
2022-09-28 16:42                     ` Xi Ruoyao
2022-09-28 19:18                       ` Richard Henderson
2022-10-10  1:39                         ` Lulu Cheng
2022-09-29  3:00                       ` Lulu Cheng
2022-09-29 11:45                   ` Adhemerval Zanella Netto

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220815085718.4110353-2-caiyinyu@loongson.cn \
    --to=caiyinyu@loongson.cn \
    --cc=adhemerval.zanella@linaro.org \
    --cc=carlos@redhat.com \
    --cc=i.swmail@xen0n.name \
    --cc=joseph_myers@mentor.com \
    --cc=libc-alpha@sourceware.org \
    --cc=xuchenghua@loongson.cn \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).