public inbox for libc-alpha@sourceware.org
 help / color / mirror / Atom feed
* [PATCH 0/2] LoongArch: Add optimized functions.
@ 2022-08-15  8:57 caiyinyu
  2022-08-15  8:57 ` [PATCH 1/2] LoongArch: Add optimized string functions: str{chr, chrnul, cmp, ncmp} caiyinyu
                   ` (2 more replies)
  0 siblings, 3 replies; 16+ messages in thread
From: caiyinyu @ 2022-08-15  8:57 UTC (permalink / raw)
  To: adhemerval.zanella, libc-alpha, i.swmail
  Cc: joseph_myers, carlos, xuchenghua, caiyinyu

Tested on LoongArch machine: gcc 13.0.0, Linux kernel 5.19.0 rc2,
binutils branch master 2eb132bdfb9.

make check:
XPASS: conform/UNIX98/ndbm.h/linknamespace
XPASS: conform/XOPEN2K/ndbm.h/linknamespace
XPASS: conform/XOPEN2K8/ndbm.h/linknamespace
XPASS: conform/XPG42/ndbm.h/linknamespace
UNSUPPORTED: crypt/cert
UNSUPPORTED: elf/tst-env-setuid
UNSUPPORTED: elf/tst-env-setuid-tunables
XPASS: elf/tst-protected1a
XPASS: elf/tst-protected1b
UNSUPPORTED: elf/tst-valgrind-smoke
UNSUPPORTED: misc/tst-adjtimex
UNSUPPORTED: misc/tst-clock_adjtime
UNSUPPORTED: misc/tst-ntp_adjtime
UNSUPPORTED: misc/tst-pkey
UNSUPPORTED: misc/tst-rseq
UNSUPPORTED: misc/tst-rseq-disable
UNSUPPORTED: nptl/test-cond-printers
UNSUPPORTED: nptl/test-condattr-printers
UNSUPPORTED: nptl/test-mutex-printers
UNSUPPORTED: nptl/test-mutexattr-printers
UNSUPPORTED: nptl/test-rwlock-printers
UNSUPPORTED: nptl/test-rwlockattr-printers
UNSUPPORTED: nptl/tst-rseq-nptl
UNSUPPORTED: stdlib/tst-secure-getenv
UNSUPPORTED: time/tst-clock_settime
UNSUPPORTED: time/tst-settimeofday
Summary of test results:
   4581 PASS
     20 UNSUPPORTED
     12 XFAIL
      6 XPASS

make bench:
  https://github.com/oy456xd/glibc_larch_opt/tree/main/image

caiyinyu (2):
  LoongArch: Add optimized string functions: str{chr,chrnul,cmp,ncmp}.
  LoongArch: Add optimized function: memmove.

 sysdeps/loongarch/lp64/memmove.S   | 491 +++++++++++++++++++++++++++++
 sysdeps/loongarch/lp64/strchr.S    | 145 +++++++++
 sysdeps/loongarch/lp64/strchrnul.S | 160 ++++++++++
 sysdeps/loongarch/lp64/strcmp.S    | 210 ++++++++++++
 sysdeps/loongarch/lp64/strncmp.S   | 281 +++++++++++++++++
 5 files changed, 1287 insertions(+)
 create mode 100644 sysdeps/loongarch/lp64/memmove.S
 create mode 100644 sysdeps/loongarch/lp64/strchr.S
 create mode 100644 sysdeps/loongarch/lp64/strchrnul.S
 create mode 100644 sysdeps/loongarch/lp64/strcmp.S
 create mode 100644 sysdeps/loongarch/lp64/strncmp.S

-- 
2.31.1


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH 1/2] LoongArch: Add optimized string functions: str{chr, chrnul, cmp, ncmp}.
  2022-08-15  8:57 [PATCH 0/2] LoongArch: Add optimized functions caiyinyu
@ 2022-08-15  8:57 ` caiyinyu
  2022-08-15  8:57 ` [PATCH 2/2] LoongArch: Add optimized function: memmove caiyinyu
  2022-08-15 14:02 ` [PATCH 0/2] LoongArch: Add optimized functions Carlos O'Donell
  2 siblings, 0 replies; 16+ messages in thread
From: caiyinyu @ 2022-08-15  8:57 UTC (permalink / raw)
  To: adhemerval.zanella, libc-alpha, i.swmail
  Cc: joseph_myers, carlos, xuchenghua, caiyinyu

---
 sysdeps/loongarch/lp64/strchr.S    | 145 +++++++++++++++
 sysdeps/loongarch/lp64/strchrnul.S | 160 ++++++++++++++++
 sysdeps/loongarch/lp64/strcmp.S    | 210 +++++++++++++++++++++
 sysdeps/loongarch/lp64/strncmp.S   | 281 +++++++++++++++++++++++++++++
 4 files changed, 796 insertions(+)
 create mode 100644 sysdeps/loongarch/lp64/strchr.S
 create mode 100644 sysdeps/loongarch/lp64/strchrnul.S
 create mode 100644 sysdeps/loongarch/lp64/strcmp.S
 create mode 100644 sysdeps/loongarch/lp64/strncmp.S

diff --git a/sysdeps/loongarch/lp64/strchr.S b/sysdeps/loongarch/lp64/strchr.S
new file mode 100644
index 0000000000..ffe3fbca62
--- /dev/null
+++ b/sysdeps/loongarch/lp64/strchr.S
@@ -0,0 +1,145 @@
+/* Assembly implementation of strchr.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/*
+ * ISA: LoongArch64
+ * Data Model: lp64
+ */
+
+/* basic algorithm :
+
+   +.   use ld.d and mask for the first 8 bytes or less;
+
+   +.   replicate the byte c into all 8 bytes of a1 with bstrins;
+
+   +.   use xor of a1 and the loaded data to check whether c is found;
+
+   +.   if (v0 - 0x0101010101010101) & ~(v0 | 0x7f7f7f7f7f7f7f7f) != 0, then
+   v0 has at least one \0 byte; otherwise it has no \0 byte
+ */
+
+#include <sysdep.h>
+#include <sys/asm.h>
+
+#define L_ADDIU	addi.d
+#define L_ADDU	add.d
+#define L_SUBU	sub.d
+
+#define STRCHR  strchr
+#define MOVN(rd,rs,rt) \
+	maskeqz		t6, rs, rt;\
+	masknez		rd, rd, rt;\
+	or		rd, rd, t6
+
+#define MOVN2(rd,rt) \
+	masknez		rd, rd, rt;\
+	or		rd, rd, rt
+
+
+/* char * strchr (const char *s1, int c); */
+
+LEAF(STRCHR)
+	.align		6
+
+	li.w		t4, 0x7
+	lu12i.w		a2, 0x01010
+	bstrins.d	a1, a1, 15, 8
+	andi		t0, a0, 0x7
+
+	ori		a2, a2, 0x101
+	andn		t4, a0, t4
+	slli.w		t1, t0, 3
+
+	ld.d		t4, t4, 0
+
+	nor		t8, zero, zero
+	bstrins.d	a1, a1, 31, 16
+	srl.d		t4, t4, t1
+
+	bstrins.d	a1, a1, 63, 32
+	bstrins.d	a2, a2, 63, 32
+	srl.d		a7, t8, t1
+
+	li.w		t1, 8
+	nor		t8, a7, zero
+	slli.d		a3, a2, 7
+	or		t5, t8, t4
+	and		t3, a7, a1
+
+	sub.w		t1, t1, t0
+	nor		a3, a3, zero
+	xor		t2, t5, t3
+	sub.d		a7, t5, a2
+	nor		a6, t5, a3
+
+	sub.d		a5, t2, a2
+	nor		a4, t2, a3
+
+	and		a6, a7, a6
+	and		a5, a5, a4
+	or		a7, a6, a5
+	bnez		a7, L(_mc8_a)
+
+	L_ADDU		a0, a0, t1
+L(_aloop):
+	ld.d		t4, a0, 0
+
+	xor		t2, t4, a1
+	sub.d		a7, t4, a2
+	nor		a6, t4, a3
+	sub.d		a5, t2, a2
+
+	nor		a4, t2, a3
+	and		a6, a7, a6
+	and		a5, a5, a4
+	or		a7, a6, a5
+	bnez		a7, L(_mc8_a)
+
+	ld.d		t4, a0, 8
+	L_ADDIU	 a0, a0, 16
+	xor		t2, t4, a1
+	sub.d		a7, t4, a2
+	nor		a6, t4, a3
+	sub.d		a5, t2, a2
+
+	nor		a4, t2, a3
+	and		a6, a7, a6
+	and		a5, a5, a4
+	or		a7, a6, a5
+	beqz		a7, L(_aloop)
+
+	L_ADDIU		a0, a0, -8
+L(_mc8_a):
+
+	ctz.d		t0, a5
+	ctz.d		t2, a6
+
+	srli.w		t0, t0, 3
+	srli.w		t2, t2, 3
+	sltu		t1, t2, t0
+	L_ADDU		v0, a0, t0
+	masknez		v0, v0, t1
+	jr		ra
+END(STRCHR)
+
+#ifndef ANDROID_CHANGES
+#ifdef _LIBC
+libc_hidden_builtin_def (strchr)
+weak_alias (strchr, index)
+#endif
+#endif
diff --git a/sysdeps/loongarch/lp64/strchrnul.S b/sysdeps/loongarch/lp64/strchrnul.S
new file mode 100644
index 0000000000..dcbfded765
--- /dev/null
+++ b/sysdeps/loongarch/lp64/strchrnul.S
@@ -0,0 +1,160 @@
+/* Assembly implementation of strchrnul.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/*
+ * ISA: LoongArch64
+ * Data Model: lp64
+ */
+
+/* basic algorithm :
+
+    +.  use ld.d and mask for the first 8 bytes or less;
+
+    +.  replicate the byte c into all 8 bytes of a1 with bstrins;
+
+    +.  use xor of a1 and the loaded data to check whether c is found;
+
+    +.  if (v0 - 0x0101010101010101) & ~(v0 | 0x7f7f7f7f7f7f7f7f) != 0, then
+    v0 has at least one \0 byte; otherwise it has no \0 byte
+
+ */
+
+#include <sysdep.h>
+#include <sys/asm.h>
+
+
+#define L_ADDIU	addi.d
+#define L_ADDU	add.d
+#define L_SUBU	sub.d
+
+#define STRCHRNUL   __strchrnul
+
+#define MOVN(rd,rs,rt) \
+	maskeqz		t6, rs, rt;\
+	masknez		rd, rd, rt;\
+	or		rd, rd, t6
+
+#define MOVZ(rd,rs,rt) \
+	masknez		t6, rs, rt;\
+	maskeqz		rd, rd, rt;\
+	or		rd, rd, t6
+
+
+#define MOVN2(rd,rt) \
+	masknez		rd, rd, rt;\
+	or		rd, rd, rt
+
+/* char * strchrnul (const char *s1, int c); */
+
+LEAF(STRCHRNUL)
+	.align		6
+
+	li.w		t4, 0x7
+	lu12i.w		a2, 0x01010
+	bstrins.d	a1, a1, 15, 8
+	andi		t0, a0, 0x7
+
+	ori		a2, a2, 0x101
+	andn		t4, a0, t4
+	slli.w		t1, t0, 3
+
+	ld.d		t4, t4, 0
+
+	nor		t8, zero, zero
+	bstrins.d	a1, a1, 31, 16
+	srl.d		t4, t4, t1
+
+	preld		0,  a0, 32
+	bstrins.d	a1, a1, 63, 32
+	bstrins.d	a2, a2, 63, 32
+	srl.d		a7, t8, t1
+
+	nor		t8, a7, zero
+	slli.d		a3, a2, 7
+	or		t5, t8, t4
+	and		t3, a7, a1
+
+	nor		a3, a3, zero
+	xor		t2, t5, t3
+	sub.d		a7, t5, a2
+	nor		a6, t5, a3
+
+	li.w		t1, 8
+	sub.d		a5, t2, a2
+	nor		a4, t2, a3
+
+	and		a6, a7, a6
+	and		a5, a5, a4
+	or		a7, a6, a5
+	bnez		a7, L(_mc8_a)
+
+
+	sub.w		t1, t1, t0
+	L_ADDU		a0, a0, t1
+L(_aloop):
+	ld.d		t4, a0, 0
+
+	xor		t2, t4, a1
+	sub.d		a7, t4, a2
+	nor		a6, t4, a3
+	sub.d		a5, t2, a2
+
+	nor		a4, t2, a3
+	and		a6, a7, a6
+	and		a5, a5, a4
+
+	or		a7, a6, a5
+	bnez		a7, L(_mc8_a)
+
+	ld.d		t4, a0, 8
+	L_ADDIU		a0, a0, 16
+
+	xor		t2, t4, a1
+	sub.d		a7, t4, a2
+	nor		a6, t4, a3
+	sub.d		a5, t2, a2
+
+	nor		a4, t2, a3
+	and		a6, a7, a6
+	and		a5, a5, a4
+
+	or		a7, a6, a5
+	beqz		a7, L(_aloop)
+
+	L_ADDIU		a0, a0, -8
+L(_mc8_a):
+
+	ctz.d		t0, a5
+	ctz.d		t2, a6
+
+	srli.w		t0, t0, 3
+	srli.w		t2, t2, 3
+	slt		t1, t0, t2
+
+	MOVZ(t0,t2,t1)
+
+	L_ADDU		v0, a0, t0
+	jr		ra
+END(STRCHRNUL)
+
+#ifndef ANDROID_CHANGES
+#ifdef _LIBC
+weak_alias(__strchrnul, strchrnul)
+libc_hidden_builtin_def (__strchrnul)
+#endif
+#endif
diff --git a/sysdeps/loongarch/lp64/strcmp.S b/sysdeps/loongarch/lp64/strcmp.S
new file mode 100644
index 0000000000..8160ae432d
--- /dev/null
+++ b/sysdeps/loongarch/lp64/strcmp.S
@@ -0,0 +1,210 @@
+/* Assembly implementation of strcmp.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/*
+ * ISA: LoongArch64
+ * Data Model: lp64
+ */
+
+/* basic algorithm :
+
+   +.  let t0, t1 point to a0, a1; if the low 3 bits of a0 are smaller than
+   those of a1, set a4 to 1 and let t0 point to the address with the larger low 3 bits
+
+   +.  if low 3 bit of a0 equal low 3 bit of a1, use a ldr one time and more
+   ld other times;
+
+   +.  if not,  load partial t2 and t3, check if t2 has \0;
+
+   +.  then use ld for t0, ldr for t1,
+
+   +.  if partial 8 byte  from t1 has \0, compare partial 8 byte from t1 with
+   8 byte from t0 with a mask in a7
+
+   +.  if not, ldl other part of t1, compare  8 byte from t1 with 8 byte from
+   t0
+
+   +.  if (v0 - 0x0101010101010101) & (~v0) & 0x8080808080808080 != 0, v0 has
+   one byte is \0, else has no \0
+
+   +.  for partial 8 byte from ldr t3, 0(a0), preload t3 with
+   0xffffffffffffffff
+ */
+
+#include <sys/asm.h>
+#include <sys/regdef.h>
+
+
+#define STRCMP  strcmp
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+#define REP8_80 0x8080808080808080
+
+/* Parameters and Results */
+#define src1	a0
+#define src2	a1
+#define result	v0
+/* Note: v0 = a0 in lp64 ABI */
+
+
+/* Internal variable */
+#define data1		t0
+#define data2		t1
+#define has_nul		t2
+#define diff		t3
+#define syndrome	t4
+#define zeroones	t5
+#define sevenf		t6
+#define pos		t7
+#define exchange	t8
+#define tmp1		a4
+#define tmp2		a5
+#define tmp3		a6
+#define src1_off	a2
+#define src2_off	a3
+#define tmp4		a7
+
+/* rd <- if rc then ra else rb will destroy tmp3 */
+
+#define CONDITIONSEL(rd,rc,ra,rb)\
+	masknez		tmp3, rb, rc;\
+	maskeqz		rd,   ra, rc;\
+	or		rd,   rd, tmp3
+
+
+
+/* int strcmp (const char *s1, const char *s2); */
+
+LEAF(STRCMP)
+	.align		4
+
+	xor		tmp1, src1, src2
+	lu12i.w		zeroones, 0x01010
+	lu12i.w		sevenf, 0x7f7f7
+	andi		src1_off, src1, 0x7
+	ori		zeroones, zeroones, 0x101
+	ori		sevenf, sevenf, 0xf7f
+	andi		tmp1, tmp1, 0x7
+	bstrins.d	zeroones, zeroones, 63, 32
+	bstrins.d	sevenf, sevenf, 63, 32
+	bnez		tmp1, strcmp_misaligned8
+	bnez		src1_off, strcmp_mutual_align
+strcmp_loop_aligned:
+	ld.d		data1, src1, 0
+	addi.d		src1, src1, 8
+	ld.d		data2, src2, 0
+	addi.d		src2, src2, 8
+strcmp_start_realigned:
+	sub.d		tmp1, data1, zeroones
+	or		tmp2, data1, sevenf
+	xor		diff, data1, data2
+	andn		has_nul, tmp1, tmp2
+	or		syndrome, diff, has_nul
+	beqz		syndrome, strcmp_loop_aligned
+
+strcmp_end:
+	ctz.d		pos, syndrome
+	bstrins.d	pos, zero, 2, 0
+	srl.d		data1, data1, pos
+	srl.d		data2, data2, pos
+	andi		data1, data1, 0xff
+	andi		data2, data2, 0xff
+	sub.d		result, data1, data2
+	jr		ra
+strcmp_mutual_align:
+	bstrins.d	src1, zero, 2, 0
+	bstrins.d	src2, zero, 2, 0
+	slli.d		tmp1, src1_off,  0x3
+	ld.d		data1, src1, 0
+	sub.d		tmp1, zero, tmp1
+	ld.d		data2, src2, 0
+	addi.d		src1, src1, 8
+	addi.d		src2, src2, 8
+	nor		tmp2, zero, zero
+	srl.d		tmp2, tmp2, tmp1
+	or		data1, data1, tmp2
+	or		data2, data2, tmp2
+	b		strcmp_start_realigned
+
+strcmp_misaligned8:
+
+/* check if ((src1_off != 0) && ((src2_off == 0) || (src1_off < src2_off)))
+   then exchange(src1,src2)
+*/
+	andi		src2_off, src2, 0x7
+	slt		tmp2, src1_off, src2_off
+	CONDITIONSEL(tmp2,src2_off,tmp2,tmp1)
+	maskeqz		exchange, tmp2, src1_off
+	xor		tmp3, src1, src2
+	maskeqz		tmp3, tmp3, exchange
+	xor		src1, src1, tmp3
+	xor		src2, src2, tmp3
+
+	andi		src1_off, src1, 0x7
+	beqz		src1_off, strcmp_loop_misaligned
+strcmp_do_misaligned:
+	ld.bu		data1, src1, 0
+	ld.bu		data2, src2, 0
+	xor		tmp3, data1, data2
+	addi.d		src1, src1, 1
+	masknez		tmp3, data1, tmp3
+	addi.d		src2, src2, 1
+	beqz		tmp3, strcmp_done
+	andi		src1_off, src1, 0x7
+	bnez		src1_off, strcmp_do_misaligned
+
+strcmp_loop_misaligned:
+	andi		tmp1, src2, 0xff8
+	xori		tmp1, tmp1, 0xff8
+	beqz		tmp1, strcmp_do_misaligned
+	ld.d		data1, src1, 0
+	ld.d		data2, src2, 0
+	addi.d		src1, src1, 8
+	addi.d		src2, src2, 8
+
+	sub.d		tmp1, data1, zeroones
+	or		tmp2, data1, sevenf
+	xor		diff, data1, data2
+	andn		has_nul, tmp1, tmp2
+	or		syndrome, diff, has_nul
+	beqz		syndrome, strcmp_loop_misaligned
+
+strcmp_misalign_end:
+	ctz.d		pos, syndrome
+	bstrins.d	pos, zero, 2, 0
+	srl.d		data1, data1, pos
+	srl.d		data2, data2, pos
+	andi		data1, data1, 0xff
+	andi		data2, data2, 0xff
+	sub.d		tmp1, data1, data2
+	sub.d		tmp2, data2, data1
+	CONDITIONSEL(result,exchange,tmp2,tmp1)
+	jr		ra
+
+strcmp_done:
+	sub.d		tmp1, data1, data2
+	sub.d		tmp2, data2, data1
+	CONDITIONSEL(result,exchange,tmp2,tmp1)
+	jr		ra
+END(STRCMP)
+#ifndef ANDROID_CHANGES
+#ifdef _LIBC
+libc_hidden_builtin_def (strcmp)
+#endif
+#endif
diff --git a/sysdeps/loongarch/lp64/strncmp.S b/sysdeps/loongarch/lp64/strncmp.S
new file mode 100644
index 0000000000..a72b280170
--- /dev/null
+++ b/sysdeps/loongarch/lp64/strncmp.S
@@ -0,0 +1,281 @@
+/* Assembly implementation of strncmp.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/*
+ * ISA: LoongArch64
+ * Data Model: lp64
+ */
+
+/* basic algorithm :
+
+   +.   let t0, t1 point to a0, a1; if the low 3 bits of a0 are smaller than
+   those of a1, set a4 to 1 and let t0 point to the address with the larger low 3 bits
+
+   +.   if low 3 bit of a0 equal low 3 bit of a1, use a ldr one time and more
+   ld other times;
+
+   +.   if not,  load partial t2 and t3, check if t2 has \0;
+
+   +.   then use ld for t0, ldr for t1,
+
+   +.   if partial 8 byte  from t1 has \0, compare partial 8 byte from t1 with
+   8 byte from t0 with a mask in a7
+
+   +.   if not, ldl other part of t1, compare  8 byte from t1 with 8 byte from
+   t0
+
+   +.   if (v0 - 0x0101010101010101) & (~v0) & 0x8080808080808080 != 0, v0 has
+   one byte is \0, else has no \0
+
+   +.   for partial 8 byte from ldr t3, 0(a0), preload t3 with
+   0xffffffffffffffff
+
+ */
+
+#include <sys/asm.h>
+#include <sys/regdef.h>
+
+
+#define STRNCMP strncmp
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+#define REP8_80 0x8080808080808080
+
+/* Parameters and Results */
+#define src1	a0
+#define src2	a1
+#define limit	a2
+/* Note: v0 = a0 in lp64 ABI */
+#define result	v0
+
+
+/* Internal variable */
+#define data1		t0
+#define data2		t1
+#define has_nul		t2
+#define diff		t3
+#define syndrome	t4
+#define zeroones	t5
+#define sevenf		t6
+#define pos		t7
+#define exchange	t8
+#define tmp1		a5
+#define tmp2		a6
+#define tmp3		a7
+#define src1_off	a3
+#define limit_wd	a4
+
+
+/* int strncmp (const char *s1, const char *s2, size_t n); */
+
+LEAF(STRNCMP)
+	.align		4
+	beqz		limit, strncmp_ret0
+
+	xor		tmp1, src1, src2
+	lu12i.w		zeroones, 0x01010
+	lu12i.w		sevenf, 0x7f7f7
+	andi		src1_off, src1, 0x7
+	ori		zeroones, zeroones, 0x101
+	andi		tmp1, tmp1, 0x7
+	ori		sevenf, sevenf, 0xf7f
+	bstrins.d	zeroones, zeroones, 63, 32
+	bstrins.d	sevenf, sevenf, 63, 32
+	bnez		tmp1, strncmp_misaligned8
+	bnez		src1_off, strncmp_mutual_align
+
+	addi.d		limit_wd, limit, -1
+	srli.d		limit_wd, limit_wd, 3
+
+strncmp_loop_aligned:
+	ld.d		data1, src1, 0
+	addi.d		src1, src1, 8
+	ld.d		data2, src2, 0
+	addi.d		src2, src2, 8
+
+strncmp_start_realigned:
+	addi.d		limit_wd, limit_wd, -1
+	sub.d		tmp1, data1, zeroones
+	or		tmp2, data1, sevenf
+	xor		diff, data1, data2
+	andn		has_nul, tmp1, tmp2
+	srli.d		tmp1, limit_wd, 63
+	or		syndrome, diff, has_nul
+	or		tmp2, syndrome, tmp1
+	beqz		tmp2, strncmp_loop_aligned
+
+	/* if not reach limit */
+	bge		limit_wd, zero, strncmp_not_limit
+	/* if reach limit */
+	andi		limit, limit, 0x7
+	li.w		tmp1, 0x8
+	sub.d		limit, tmp1, limit
+	slli.d		limit, limit, 0x3
+	li.d		tmp1, -1
+	srl.d		tmp1, tmp1, limit
+	and		data1, data1, tmp1
+	and		data2, data2, tmp1
+	orn		syndrome, syndrome, tmp1
+
+
+strncmp_not_limit:
+	ctz.d		pos, syndrome
+	bstrins.d	pos, zero, 2, 0
+	srl.d		data1, data1, pos
+	srl.d		data2, data2, pos
+	andi		data1, data1, 0xff
+	andi		data2, data2, 0xff
+	sub.d		result, data1, data2
+	jr		ra
+
+
+
+strncmp_mutual_align:
+	bstrins.d	src1, zero, 2, 0
+	bstrins.d	src2, zero, 2, 0
+	slli.d		tmp1, src1_off,  0x3
+	ld.d		data1, src1, 0
+	ld.d		data2, src2, 0
+	addi.d		src2, src2, 8
+	addi.d		src1, src1, 8
+
+	addi.d		limit_wd, limit, -1
+	andi		tmp3, limit_wd, 0x7
+	srli.d		limit_wd, limit_wd, 3
+	add.d		limit, limit, src1_off
+	add.d		tmp3, tmp3, src1_off
+	srli.d		tmp3, tmp3, 3
+	add.d		limit_wd, limit_wd, tmp3
+
+	sub.d		tmp1, zero, tmp1
+	nor		tmp2, zero, zero
+	srl.d		tmp2, tmp2, tmp1
+	or		data1, data1, tmp2
+	or		data2, data2, tmp2
+	b		strncmp_start_realigned
+
+strncmp_misaligned8:
+	li.w		tmp1, 0x10
+	bge		limit, tmp1, strncmp_try_words
+
+strncmp_byte_loop:
+	ld.bu		data1, src1, 0
+	ld.bu		data2, src2, 0
+	addi.d		limit, limit, -1
+	xor		tmp1, data1, data2
+	masknez		tmp1, data1, tmp1
+	maskeqz		tmp1, limit, tmp1
+	beqz		tmp1, strncmp_done
+
+	ld.bu		data1, src1, 1
+	ld.bu		data2, src2, 1
+	addi.d		src1, src1, 2
+	addi.d		src2, src2, 2
+	addi.d		limit, limit, -1
+	xor		tmp1, data1, data2
+	masknez		tmp1, data1, tmp1
+	maskeqz		tmp1, limit, tmp1
+	bnez		tmp1, strncmp_byte_loop
+
+
+strncmp_done:
+	sub.d		result, data1, data2
+	jr		ra
+
+strncmp_try_words:
+	srli.d		limit_wd, limit, 3
+	beqz		src1_off, strncmp_do_misaligned
+
+	sub.d		src1_off, zero, src1_off
+	andi		src1_off, src1_off, 0x7
+	sub.d		limit, limit, src1_off
+	srli.d		limit_wd, limit, 0x3
+
+
+strncmp_page_end_loop:
+	ld.bu		data1, src1, 0
+	ld.bu		data2, src2, 0
+	addi.d		src1, src1, 1
+	addi.d		src2, src2, 1
+	xor		tmp1, data1, data2
+	masknez		tmp1, data1, tmp1
+	beqz		tmp1, strncmp_done
+	andi		tmp1, src1, 0x7
+	bnez		tmp1, strncmp_page_end_loop
+strncmp_do_misaligned:
+	li.w		src1_off, 0x8
+	addi.d		limit_wd, limit_wd, -1
+	blt		limit_wd, zero, strncmp_done_loop
+
+strncmp_loop_misaligned:
+	andi		tmp2, src2, 0xff8
+	xori		tmp2, tmp2, 0xff8
+	beqz		tmp2, strncmp_page_end_loop
+
+	ld.d		data1, src1, 0
+	ld.d		data2, src2, 0
+	addi.d		src1, src1, 8
+	addi.d		src2, src2, 8
+	sub.d		tmp1, data1, zeroones
+	or		tmp2, data1, sevenf
+	xor		diff, data1, data2
+	andn		has_nul, tmp1, tmp2
+	or		syndrome, diff, has_nul
+	bnez		syndrome, strncmp_not_limit
+	addi.d		limit_wd, limit_wd, -1
+	bge		limit_wd, zero, strncmp_loop_misaligned
+
+strncmp_done_loop:
+	andi		limit, limit, 0x7
+	beqz		limit, strncmp_not_limit
+	/* Read the last double word */
+	/* check if the final part is about to exceed the page */
+	andi		tmp1, src2, 0x7
+	andi		tmp2, src2, 0xff8
+	add.d		tmp1, tmp1, limit
+	xori		tmp2, tmp2, 0xff8
+	andi		tmp1, tmp1, 0x8
+	masknez		tmp1, tmp1, tmp2
+	bnez		tmp1, strncmp_byte_loop
+	addi.d		src1, src1, -8
+	addi.d		src2, src2, -8
+	ldx.d		data1, src1, limit
+	ldx.d		data2, src2, limit
+	sub.d		tmp1, data1, zeroones
+	or		tmp2, data1, sevenf
+	xor		diff, data1, data2
+	andn		has_nul, tmp1, tmp2
+	or		syndrome, diff, has_nul
+	bnez		syndrome, strncmp_not_limit
+
+strncmp_ret0:
+	move		result, zero
+	jr		ra
+
+/* NOTE(review): stale comment — this strncmp implementation never exchanges
+   src1 and src2; the exchange logic exists only in strcmp.S.  Consider removing.
+ */
+
+
+END(STRNCMP)
+#ifndef ANDROID_CHANGES
+#ifdef _LIBC
+libc_hidden_builtin_def (strncmp)
+#endif
+#endif
-- 
2.31.1


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH 2/2] LoongArch: Add optimized function: memmove.
  2022-08-15  8:57 [PATCH 0/2] LoongArch: Add optimized functions caiyinyu
  2022-08-15  8:57 ` [PATCH 1/2] LoongArch: Add optimized string functions: str{chr, chrnul, cmp, ncmp} caiyinyu
@ 2022-08-15  8:57 ` caiyinyu
  2022-08-15 14:02 ` [PATCH 0/2] LoongArch: Add optimized functions Carlos O'Donell
  2 siblings, 0 replies; 16+ messages in thread
From: caiyinyu @ 2022-08-15  8:57 UTC (permalink / raw)
  To: adhemerval.zanella, libc-alpha, i.swmail
  Cc: joseph_myers, carlos, xuchenghua, caiyinyu

---
 sysdeps/loongarch/lp64/memmove.S | 491 +++++++++++++++++++++++++++++++
 1 file changed, 491 insertions(+)
 create mode 100644 sysdeps/loongarch/lp64/memmove.S

diff --git a/sysdeps/loongarch/lp64/memmove.S b/sysdeps/loongarch/lp64/memmove.S
new file mode 100644
index 0000000000..632820404e
--- /dev/null
+++ b/sysdeps/loongarch/lp64/memmove.S
@@ -0,0 +1,491 @@
+/* Assembly implementation of memmove.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifdef _LIBC
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+#else
+#include <regdef.h>
+#include <sys/asm.h>
+#endif
+
+/* Allow the routine to be named something else if desired.  */
+#ifndef MEMMOVE_NAME
+#define MEMMOVE_NAME memmove
+#endif
+
+#define LD_64(reg, n)		\
+	ld.d	t0, reg, n;	\
+	ld.d	t1, reg, n+8;	\
+	ld.d	t2, reg, n+16;	\
+	ld.d	t3, reg, n+24;	\
+	ld.d	t4, reg, n+32;	\
+	ld.d	t5, reg, n+40;	\
+	ld.d	t6, reg, n+48;	\
+	ld.d	t7, reg, n+56;
+
+
+#define ST_64(reg, n)		\
+	st.d	t0, reg, n;	\
+	st.d	t1, reg, n+8;	\
+	st.d	t2, reg, n+16;	\
+	st.d	t3, reg, n+24;	\
+	st.d	t4, reg, n+32;	\
+	st.d	t5, reg, n+40;	\
+	st.d	t6, reg, n+48;	\
+	st.d	t7, reg, n+56;
+
+#define LDST_1024		\
+	LD_64(a1, 0);		\
+	ST_64(a0, 0);		\
+	LD_64(a1, 64);		\
+	ST_64(a0, 64);		\
+	LD_64(a1, 128);		\
+	ST_64(a0, 128);		\
+	LD_64(a1, 192);		\
+	ST_64(a0, 192);		\
+	LD_64(a1, 256);		\
+	ST_64(a0, 256);		\
+	LD_64(a1, 320);		\
+	ST_64(a0, 320);		\
+	LD_64(a1, 384);		\
+	ST_64(a0, 384);		\
+	LD_64(a1, 448);		\
+	ST_64(a0, 448);		\
+	LD_64(a1, 512);		\
+	ST_64(a0, 512);		\
+	LD_64(a1, 576);		\
+	ST_64(a0, 576);		\
+	LD_64(a1, 640);		\
+	ST_64(a0, 640);		\
+	LD_64(a1, 704);		\
+	ST_64(a0, 704);		\
+	LD_64(a1, 768);		\
+	ST_64(a0, 768);		\
+	LD_64(a1, 832);		\
+	ST_64(a0, 832);		\
+	LD_64(a1, 896);		\
+	ST_64(a0, 896);		\
+	LD_64(a1, 960);		\
+	ST_64(a0, 960);
+
+#define LDST_1024_BACK		\
+	LD_64(a4, -64);		\
+	ST_64(a3, -64);		\
+	LD_64(a4, -128);	\
+	ST_64(a3, -128);	\
+	LD_64(a4, -192);	\
+	ST_64(a3, -192);	\
+	LD_64(a4, -256);	\
+	ST_64(a3, -256);	\
+	LD_64(a4, -320);	\
+	ST_64(a3, -320);	\
+	LD_64(a4, -384);	\
+	ST_64(a3, -384);	\
+	LD_64(a4, -448);	\
+	ST_64(a3, -448);	\
+	LD_64(a4, -512);	\
+	ST_64(a3, -512);	\
+	LD_64(a4, -576);	\
+	ST_64(a3, -576);	\
+	LD_64(a4, -640);	\
+	ST_64(a3, -640);	\
+	LD_64(a4, -704);	\
+	ST_64(a3, -704);	\
+	LD_64(a4, -768);	\
+	ST_64(a3, -768);	\
+	LD_64(a4, -832);	\
+	ST_64(a3, -832);	\
+	LD_64(a4, -896);	\
+	ST_64(a3, -896);	\
+	LD_64(a4, -960);	\
+	ST_64(a3, -960);	\
+	LD_64(a4, -1024);	\
+	ST_64(a3, -1024);
+
+#ifdef ANDROID_CHANGES
+LEAF(MEMMOVE_NAME, 0)
+#else
+LEAF(MEMMOVE_NAME)
+#endif
+
+/* 1st var: dest ptr: void *str1 $r4 a0 */
+/* 2nd var: src  ptr: void *str2 $r5 a1 */
+/* 3rd var: size_t num */
+/* t0~t9 registers as temp */
+
+	add.d		a4, a1, a2
+	add.d		a3, a0, a2
+	beq		a1, a0, less_1bytes
+	move		t8, a0
+	srai.d		a6, a2, 4		#num/16
+	beqz		a6, less_16bytes	#num<16
+	srai.d		a6, a2, 6		#num/64
+	bnez		a6, more_64bytes	#num>64
+	srai.d		a6, a2, 5
+	beqz		a6, less_32bytes	#num<32
+
+	ld.d		t0, a1, 0		#32<num<64
+	ld.d		t1, a1, 8
+	ld.d		t2, a1, 16
+	ld.d		t3, a1, 24
+	ld.d		t4, a4, -32
+	ld.d		t5, a4, -24
+	ld.d		t6, a4, -16
+	ld.d		t7, a4, -8
+	st.d		t0, a0, 0
+	st.d		t1, a0, 8
+	st.d		t2, a0, 16
+	st.d		t3, a0, 24
+	st.d		t4, a3, -32
+	st.d		t5, a3, -24
+	st.d		t6, a3, -16
+	st.d		t7, a3, -8
+
+	jr		ra
+
+less_32bytes:
+	ld.d		t0, a1, 0
+	ld.d		t1, a1, 8
+	ld.d		t2, a4, -16
+	ld.d		t3, a4, -8
+	st.d		t0, a0, 0
+	st.d		t1, a0, 8
+	st.d		t2, a3, -16
+	st.d		t3, a3, -8
+
+	jr		ra
+
+less_16bytes:
+	srai.d		a6, a2, 3		#num/8
+	beqz		a6, less_8bytes
+
+	ld.d		t0, a1, 0
+	ld.d		t1, a4, -8
+	st.d		t0, a0, 0
+	st.d		t1, a3, -8
+
+	jr		ra
+
+less_8bytes:
+	srai.d		a6, a2, 2
+	beqz		a6, less_4bytes
+
+	ld.w		t0, a1, 0
+	ld.w		t1, a4, -4
+	st.w		t0, a0, 0
+	st.w		t1, a3, -4
+
+	jr		ra
+
+less_4bytes:
+	srai.d		a6, a2, 1
+	beqz		a6, less_2bytes
+
+	ld.h		t0, a1, 0
+	ld.h		t1, a4, -2
+	st.h		t0, a0, 0
+	st.h		t1, a3, -2
+
+	jr		ra
+
+less_2bytes:
+	beqz		a2, less_1bytes
+
+	ld.b		t0, a1, 0
+	st.b		t0, a0, 0
+
+	jr		ra
+
+less_1bytes:
+	jr		ra
+
+more_64bytes:
+	sub.d		a7, a0, a1
+	bltu		a7, a2, copy_backward
+
+copy_forward:
+	srli.d		a0, a0, 3
+	slli.d		a0, a0, 3
+	beq		a0, t8, all_align
+	addi.d		a0, a0, 0x8
+	sub.d		a7, t8, a0
+	sub.d		a1, a1, a7
+	add.d		a2, a7, a2
+
+start_unalign_proc:
+	pcaddi		t1, 18
+	slli.d		a6, a7, 3
+	add.d		t1, t1, a6
+	jirl		zero, t1, 0
+
+start_7_unalign:
+	ld.b		t0, a1, -7
+	st.b		t0, a0, -7
+start_6_unalign:
+	ld.b		t0, a1, -6
+	st.b		t0, a0, -6
+start_5_unalign:
+	ld.b		t0, a1, -5
+	st.b		t0, a0, -5
+start_4_unalign:
+	ld.b		t0, a1, -4
+	st.b		t0, a0, -4
+start_3_unalign:
+	ld.b		t0, a1, -3
+	st.b		t0, a0, -3
+start_2_unalign:
+	ld.b		t0, a1, -2
+	st.b		t0, a0, -2
+start_1_unalign:
+	ld.b		t0, a1, -1
+	st.b		t0, a0, -1
+
+start_over:
+	addi.d		a2, a2, -0x80
+	blt		a2, zero, end_unalign_proc
+
+loop_less:
+	LD_64(a1, 0)
+	ST_64(a0, 0)
+	LD_64(a1, 64)
+	ST_64(a0, 64)
+
+	addi.d		a0, a0,  0x80
+	addi.d		a1, a1,  0x80
+	addi.d		a2, a2, -0x80
+	bge		a2, zero, loop_less
+
+end_unalign_proc:
+	addi.d		a2, a2, 0x80
+
+	pcaddi		t1, 36
+	andi		t2, a2, 0x78
+	add.d		a1, a1, t2
+	add.d		a0, a0, t2
+	sub.d		t1, t1, t2
+	jirl		zero, t1, 0
+
+end_120_128_unalign:
+	ld.d		t0, a1, -120
+	st.d		t0, a0, -120
+end_112_120_unalign:
+	ld.d		t0, a1, -112
+	st.d		t0, a0, -112
+end_104_112_unalign:
+	ld.d		t0, a1, -104
+	st.d		t0, a0, -104
+end_96_104_unalign:
+	ld.d		t0, a1, -96
+	st.d		t0, a0, -96
+end_88_96_unalign:
+	ld.d		t0, a1, -88
+	st.d		t0, a0, -88
+end_80_88_unalign:
+	ld.d		t0, a1, -80
+	st.d		t0, a0, -80
+end_72_80_unalign:
+	ld.d		t0, a1, -72
+	st.d		t0, a0, -72
+end_64_72_unalign:
+	ld.d		t0, a1, -64
+	st.d		t0, a0, -64
+end_56_64_unalign:
+	ld.d		t0, a1, -56
+	st.d		t0, a0, -56
+end_48_56_unalign:
+	ld.d		t0, a1, -48
+	st.d		t0, a0, -48
+end_40_48_unalign:
+	ld.d		t0, a1, -40
+	st.d		t0, a0, -40
+end_32_40_unalign:
+	ld.d		t0, a1, -32
+	st.d		t0, a0, -32
+end_24_32_unalign:
+	ld.d		t0, a1, -24
+	st.d		t0, a0, -24
+end_16_24_unalign:
+	ld.d		t0, a1, -16
+	st.d		t0, a0, -16
+end_8_16_unalign:
+	ld.d		t0, a1, -8
+	st.d		t0, a0, -8
+end_0_8_unalign:
+	andi		a2, a2, 0x7
+	pcaddi		t1, 18
+	slli.d		a2, a2, 3
+	sub.d		t1, t1, a2
+	jirl		zero, t1, 0
+end_7_unalign:
+	ld.b		t0, a4, -7
+	st.b		t0, a3, -7
+end_6_unalign:
+	ld.b		t0, a4, -6
+	st.b		t0, a3, -6
+end_5_unalign:
+	ld.b		t0, a4, -5
+	st.b		t0, a3, -5
+end_4_unalign:
+	ld.b		t0, a4, -4
+	st.b		t0, a3, -4
+end_3_unalign:
+	ld.b		t0, a4, -3
+	st.b		t0, a3, -3
+end_2_unalign:
+	ld.b		t0, a4, -2
+	st.b		t0, a3, -2
+end_1_unalign:
+	ld.b		t0, a4, -1
+	st.b		t0, a3, -1
+end:
+	move		v0, t8
+	jr		ra
+
+all_align:
+	addi.d		a1, a1, 0x8
+	addi.d		a0, a0, 0x8
+	ld.d		t0, a1, -8
+	st.d		t0, a0, -8
+	addi.d		a2, a2, -8
+	b		start_over
+
+all_align_back:
+	addi.d		a4, a4, -0x8
+	addi.d		a3, a3, -0x8
+	ld.d		t0, a4, 0
+	st.d		t0, a3, 0
+	addi.d		a2, a2, -8
+	b		start_over_back
+
+copy_backward:
+	move		a5, a3
+	srli.d		a3, a3, 3
+	slli.d		a3, a3, 3
+	beq		a3, a5, all_align_back
+	sub.d		a7, a3, a5
+	add.d		a4, a4, a7
+	add.d		a2, a7, a2
+
+	pcaddi		t1, 18
+	slli.d		a6, a7, 3
+	add.d		t1, t1, a6
+	jirl		zero, t1, 0
+
+	ld.b		t0, a4, 6
+	st.b		t0, a3, 6
+	ld.b		t0, a4, 5
+	st.b		t0, a3, 5
+	ld.b		t0, a4, 4
+	st.b		t0, a3, 4
+	ld.b		t0, a4, 3
+	st.b		t0, a3, 3
+	ld.b		t0, a4, 2
+	st.b		t0, a3, 2
+	ld.b		t0, a4, 1
+	st.b		t0, a3, 1
+	ld.b		t0, a4, 0
+	st.b		t0, a3, 0
+
+start_over_back:
+	addi.d		a2, a2, -0x80
+	blt		a2, zero, end_unalign_proc_back
+
+loop_less_back:
+	LD_64(a4, -64)
+	ST_64(a3, -64)
+	LD_64(a4, -128)
+	ST_64(a3, -128)
+
+	addi.d		a4, a4, -0x80
+	addi.d		a3, a3, -0x80
+	addi.d		a2, a2, -0x80
+	bge		a2, zero, loop_less_back
+
+end_unalign_proc_back:
+	addi.d		a2, a2, 0x80
+
+	pcaddi		t1, 36
+	andi		t2, a2, 0x78
+	sub.d		a4, a4, t2
+	sub.d		a3, a3, t2
+	sub.d		t1, t1, t2
+	jirl		zero, t1, 0
+
+	ld.d		t0, a4, 112
+	st.d		t0, a3, 112
+	ld.d		t0, a4, 104
+	st.d		t0, a3, 104
+	ld.d		t0, a4, 96
+	st.d		t0, a3, 96
+	ld.d		t0, a4, 88
+	st.d		t0, a3, 88
+	ld.d		t0, a4, 80
+	st.d		t0, a3, 80
+	ld.d		t0, a4, 72
+	st.d		t0, a3, 72
+	ld.d		t0, a4, 64
+	st.d		t0, a3, 64
+	ld.d		t0, a4, 56
+	st.d		t0, a3, 56
+	ld.d		t0, a4, 48
+	st.d		t0, a3, 48
+	ld.d		t0, a4, 40
+	st.d		t0, a3, 40
+	ld.d		t0, a4, 32
+	st.d		t0, a3, 32
+	ld.d		t0, a4, 24
+	st.d		t0, a3, 24
+	ld.d		t0, a4, 16
+	st.d		t0, a3, 16
+	ld.d		t0, a4, 8
+	st.d		t0, a3, 8
+	ld.d		t0, a4, 0
+	st.d		t0, a3, 0
+
+	andi		a2, a2, 0x7
+	pcaddi		t1, 18
+	slli.d		a2, a2, 3
+	sub.d		t1, t1, a2
+	jirl		zero, t1, 0
+
+	ld.b		t0, a1, 6
+	st.b		t0, a0, 6
+	ld.b		t0, a1, 5
+	st.b		t0, a0, 5
+	ld.b		t0, a1, 4
+	st.b		t0, a0, 4
+	ld.b		t0, a1, 3
+	st.b		t0, a0, 3
+	ld.b		t0, a1, 2
+	st.b		t0, a0, 2
+	ld.b		t0, a1, 1
+	st.b		t0, a0, 1
+	ld.b		t0, a1, 0
+	st.b		t0, a0, 0
+
+	move		v0, t8
+	jr		ra
+
+END(MEMMOVE_NAME)
+#ifndef ANDROID_CHANGES
+#ifdef _LIBC
+libc_hidden_builtin_def (MEMMOVE_NAME)
+#endif
+#endif
-- 
2.31.1


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/2] LoongArch: Add optimized functions.
  2022-08-15  8:57 [PATCH 0/2] LoongArch: Add optimized functions caiyinyu
  2022-08-15  8:57 ` [PATCH 1/2] LoongArch: Add optimized string functions: str{chr, chrnul, cmp, ncmp} caiyinyu
  2022-08-15  8:57 ` [PATCH 2/2] LoongArch: Add optimized function: memmove caiyinyu
@ 2022-08-15 14:02 ` Carlos O'Donell
  2022-08-15 20:46   ` Joseph Myers
  2 siblings, 1 reply; 16+ messages in thread
From: Carlos O'Donell @ 2022-08-15 14:02 UTC (permalink / raw)
  To: caiyinyu, adhemerval.zanella, libc-alpha, i.swmail
  Cc: joseph_myers, xuchenghua

On 8/15/22 04:57, caiyinyu wrote:
> Tested on LoongArch machine: gcc 13.0.0, Linux kernel 5.19.0 rc2,
> binutils branch master 2eb132bdfb9.

Could you please post microbenchmark results for these changes?

How much faster are they from the generic versions?

It is important to show a baseline and the improvement, that way future developers
looking to improve on your work can refer back to these posted numbers for the
hardware you used.

-- 
Cheers,
Carlos.


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/2] LoongArch: Add optimized functions.
  2022-08-15 14:02 ` [PATCH 0/2] LoongArch: Add optimized functions Carlos O'Donell
@ 2022-08-15 20:46   ` Joseph Myers
  2022-09-02 12:27     ` Adhemerval Zanella Netto
  0 siblings, 1 reply; 16+ messages in thread
From: Joseph Myers @ 2022-08-15 20:46 UTC (permalink / raw)
  To: Carlos O'Donell
  Cc: caiyinyu, adhemerval.zanella, libc-alpha, i.swmail, xuchenghua

On Mon, 15 Aug 2022, Carlos O'Donell via Libc-alpha wrote:

> On 8/15/22 04:57, caiyinyu wrote:
> > Tested on LoongArch machine: gcc 13.0.0, Linux kernel 5.19.0 rc2,
> > binutils branch master 2eb132bdfb9.
> 
> Could you please post microbenchmark results for these changes?
> 
> How much faster are they from the generic versions?

Note that so far we haven't merged the improved generic string functions 
that were posted a while back 
(https://sourceware.org/legacy-ml/libc-alpha/2018-01/msg00318.html is the 
version linked from https://sourceware.org/glibc/wiki/NewPorts - don't 
know if it's the most recent version).  So even if assembly versions are 
better than the current generic string functions, they might not be better 
than improved generic versions with architecture-specific implementations 
of the headers to provide per-architecture tuning.

-- 
Joseph S. Myers
joseph@codesourcery.com

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/2] LoongArch: Add optimized functions.
  2022-08-15 20:46   ` Joseph Myers
@ 2022-09-02 12:27     ` Adhemerval Zanella Netto
       [not found]       ` <403f78f0-55d9-48cf-c62a-4a0462a76987@loongson.cn>
  0 siblings, 1 reply; 16+ messages in thread
From: Adhemerval Zanella Netto @ 2022-09-02 12:27 UTC (permalink / raw)
  To: Joseph Myers, Carlos O'Donell
  Cc: caiyinyu, libc-alpha, i.swmail, xuchenghua



On 15/08/22 17:46, Joseph Myers wrote:
> On Mon, 15 Aug 2022, Carlos O'Donell via Libc-alpha wrote:
> 
>> On 8/15/22 04:57, caiyinyu wrote:
>>> Tested on LoongArch machine: gcc 13.0.0, Linux kernel 5.19.0 rc2,
>>> binutils branch master 2eb132bdfb9.
>>
>> Could you please post microbenchmark results for these changes?
>>
>> How much faster are they from the generic versions?
> 
> Note that so far we haven't merged the improved generic string functions 
> that were posted a while back 
> (https://sourceware.org/legacy-ml/libc-alpha/2018-01/msg00318.html is the 
> version linked from https://sourceware.org/glibc/wiki/NewPorts - don't 
> know if it's the most recent version).  So even if assembly versions are 
> better than the current generic string functions, they might not be better 
> than improved generic versions with architecture-specific implementations 
> of the headers to provide per-architecture tuning.
> 

And it seems that some of these newer implementations do what my patch
basically does.  The memmove is an improvement since the generic code we
have does an internal libcall to memcpy (which some architectures optimize
by implementing memcpy and memmove in the same TU so that one is just a
branch instead of a function call).

I will rebase and resend my improved generic string, I think it would
yield very similar numbers to the str* assembly implementations proposed.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/2] LoongArch: Add optimized functions.
       [not found]         ` <2022091910031722091613@loongson.cn>
@ 2022-09-19 20:16           ` Adhemerval Zanella Netto
  2022-09-20  9:54             ` Xi Ruoyao
  0 siblings, 1 reply; 16+ messages in thread
From: Adhemerval Zanella Netto @ 2022-09-19 20:16 UTC (permalink / raw)
  To: dengjianbo; +Cc: joseph, carlos, libc-alpha, i.swmail, xuchenghua, caiyinyu



On 18/09/22 23:03, dengjianbo@loongson.cn wrote:
> Hi Adhemerval,
> 
> Please kindly see the following link for the test results of comparing with 
> new generic version. 
> https://sourceware.org/pipermail/libc-alpha/2022-September/142016.html
> 
> Comparing with the previous patch, we further optimized strchr and
> strchrnul, 4 instructions was reduced before the loop.

Do you have any breakdown if either loop unrolling or missing string-fzi.h/
string-fza.h is what is making difference in string routines? 

Checking on last iteration [1], it seems that strchr is issuing 2 loads
on each loop iteration and using bit-manipulation instruction that I am
not sure compiler could emit with generic code. Maybe we can tune the
generic implementation to get similar performance, as Richard has done
for alpha, hppa, sh, and powerpc?

I am asking because from the brief description of the algorithm, the
general idea is essentially what my generic code aims to do (mask-off
initial bytes, use word-aligned load and vectorized compares, extract
final bytes), and I am hoping that architecture would provide 
string-fz{i,a}.h to get better code generation instead of pushing
for more and more hand-write assembly routines.

[1] https://patchwork.sourceware.org/project/glibc/patch/20220916071642.2822131-2-dengjianbo@loongson.cn/

> 
> Best regards,
> Deng jianbo
> From: Adhemerval Zanella Netto <adhemerval.zanella@linaro.org>
> Date: Fri, 2 Sep 2022 09:27:33 -0300
> To: Joseph Myers <joseph@codesourcery.com>, Carlos O'Donell <carlos@redhat.com>
> CC:caiyinyu <caiyinyu@loongson.cn>, libc-alpha@sourceware.org, i.swmail@xen0n.name, xuchenghua@loongson.cn
> Subject: Re: [PATCH 0/2] LoongArch: Add optimized functions.
> 
> On 15/08/22 17:46, Joseph Myers wrote:
> On Mon, 15 Aug 2022, Carlos O'Donell via Libc-alpha wrote:
> 
> On 8/15/22 04:57, caiyinyu wrote:
> Tested on LoongArch machine: gcc 13.0.0, Linux kernel 5.19.0 rc2,
> binutils branch master 2eb132bdfb9.
> 
> Could you please post microbenchmark results for these changes?
> 
> How much faster are they from the generic versions?
> 
> Note that so far we haven't merged the improved generic string functions that were posted a while back (https://sourceware.org/legacy-ml/libc-alpha/2018-01/msg00318.html is the version linked from https://sourceware.org/glibc/wiki/NewPorts - don't know if it's the most recent version). So even if assembly versions are better than the current generic string functions, they might not be better than improved generic versions with architecture-specific implementations of the headers to provide per-architecture tuning.
> 
> 
> And it seems that some of this newer implementations does what my patch
> basically does. The memmove is an improvement since the generic code we
> have does a internal libcall to memcpy (which some architecture optimizes
> it by implementing memcpy and memmove on some TU to just do a branch instead of a function call).
> 
> I will rebase and resend my improved generic string, I think it would
> yield very similar numbers to the str* assembly implementations proposed.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/2] LoongArch: Add optimized functions.
  2022-09-19 20:16           ` Adhemerval Zanella Netto
@ 2022-09-20  9:54             ` Xi Ruoyao
  2022-09-22 18:05               ` Adhemerval Zanella Netto
  0 siblings, 1 reply; 16+ messages in thread
From: Xi Ruoyao @ 2022-09-20  9:54 UTC (permalink / raw)
  To: Adhemerval Zanella Netto, dengjianbo
  Cc: libc-alpha, caiyinyu, xuchenghua, i.swmail, joseph

On Mon, 2022-09-19 at 17:16 -0300, Adhemerval Zanella Netto via Libc-
alpha wrote:
> Do you have any breakdown if either loop unrolling or missing string-fzi.h/
> string-fza.h is what is making difference in string routines? 

It looks like there are some difficulties... LoongArch does not have a
dedicated instruction for finding a zero byte among the 8 bytes in a
register (I guess the LoongArch SIMD eXtension will provide such an
instruction, but the full LSX manual is not published yet and some
LoongArch processors may lack LSX).  So the assembly code submitted by
dengjianbo relies on a register to cache the bit pattern
0x0101010101010101.  We can't just rematerialize it (with 3
instructions) in has_zero or has_eq etc., or the performance will
likely be horribly bad.  

> Checking on last iteration [1], it seems that strchr is issuing 2 loads
> on each loop iteration and using bit-manipulation instruction that I am
> not sure compiler could emit with generic code. Maybe we can tune the
> generic implementation to get similar performance, as Richard has done
> for alpha, hppa, sh, and powerpc?
> 
> I am asking because from the brief description of the algorithm, the
> general idea is essentially what my generic code aims to do (mask-off
> initial bytes, use word-aligned load and vectorized compares, extract
> final bytes), and I am hoping that architecture would provide 
> string-fz{i,a}.h to get better code generation instead of pushing
> for more and more hand-write assembly routines.

-- 
Xi Ruoyao <xry111@xry111.site>
School of Aerospace Science and Technology, Xidian University

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/2] LoongArch: Add optimized functions.
  2022-09-20  9:54             ` Xi Ruoyao
@ 2022-09-22 18:05               ` Adhemerval Zanella Netto
  2022-09-26 13:49                 ` Xi Ruoyao
  0 siblings, 1 reply; 16+ messages in thread
From: Adhemerval Zanella Netto @ 2022-09-22 18:05 UTC (permalink / raw)
  To: Xi Ruoyao, dengjianbo; +Cc: libc-alpha, caiyinyu, xuchenghua, i.swmail, joseph



On 20/09/22 06:54, Xi Ruoyao wrote:
> On Mon, 2022-09-19 at 17:16 -0300, Adhemerval Zanella Netto via Libc-
> alpha wrote:
>> Do you have any breakdown if either loop unrolling or missing string-fzi.h/
>> string-fza.h is what is making difference in string routines? 
> 
> It looks like there are some difficulties... LoongArch does not have a
> dedicated instruction for finding a zero byte among the 8 bytes in a
> register (I guess the LoongArch SIMD eXtension will provide such an
> instruction, but the full LSX manual is not published yet and some
> LoongArch processors may lack LSX).  So the assembly code submitted by
> dengjianbo relies on a register to cache the bit pattern
> 0x0101010101010101.  We can't just rematerialize it (with 3
> instructions) in has_zero or has_eq etc. or the performance will be
> likely horribly bad.  

The 0x0101010101010101 is already created on find_zero_low (lsb), so creating
it again on another static inline function should provide enough information
to compiler to optimize the materialization to avoid doing it twice. So
maybe adding a LoongArch specific index_first_zero_eq should be suffice.

Maybe we can parametrize strchr with an extra function to do what the final
step does:

    op_t found = index_first_zero_eq (word, repeated_c);
    if (extractbyte (word, found) == c)
      return (char *) (word_ptr) + found;
    return NULL;

So LoongArch can reimplement it with a better strategy as well.

The idea is this generic implementation is exactly to find the missing spots
where C code could not produce the best instruction and parametrize in way
that allows each architecture to reimplement in the best way.

> 
>> Checking on last iteration [1], it seems that strchr is issuing 2 loads
>> on each loop iteration and using bit-manipulation instruction that I am
>> not sure compiler could emit with generic code. Maybe we can tune the
>> generic implementation to get similar performance, as Richard has done
>> for alpha, hppa, sh, and powerpc?
>>
>> I am asking because from the brief description of the algorithm, the
>> general idea is essentially what my generic code aims to do (mask-off
>> initial bytes, use word-aligned load and vectorized compares, extract
>> final bytes), and I am hoping that architecture would provide 
>> string-fz{i,a}.h to get better code generation instead of pushing
>> for more and more hand-write assembly routines.
> 

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/2] LoongArch: Add optimized functions.
  2022-09-22 18:05               ` Adhemerval Zanella Netto
@ 2022-09-26 13:49                 ` Xi Ruoyao
  2022-09-28 14:22                   ` Richard Henderson
  2022-09-29 11:45                   ` Adhemerval Zanella Netto
  0 siblings, 2 replies; 16+ messages in thread
From: Xi Ruoyao @ 2022-09-26 13:49 UTC (permalink / raw)
  To: Adhemerval Zanella Netto, dengjianbo
  Cc: libc-alpha, caiyinyu, xuchenghua, i.swmail, joseph

[-- Attachment #1: Type: text/plain, Size: 762 bytes --]

Hi Adhemerval and Jianbo,

I've customized string-fzi.h and string-maskoff.h for LoongArch (see
attachment).  With them on top of Adhermerval's v5 "Improve generic
string routines" patch and GCC & Binutils trunk, the benchmark result
seems comparable with the assembly version for strchr, strcmp, and
strchrnul.

By the way I've tried to unroll the loop in strchr manually, but then
the compiler produced some bad thing (moving words from a register to
another with no reason) and the result is slower.

I've not really plotted the result, just took a quick look with my
eyes.  You can try the bench with my headers in sysdeps/loongarch.
> 

-- 
Xi Ruoyao <xry111@xry111.site>
School of Aerospace Science and Technology, Xidian University

[-- Attachment #2: string-maskoff.h --]
[-- Type: text/x-chdr, Size: 2955 bytes --]

/* Mask off bits.  LoongArch version.
   Copyright (C) 2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#ifndef _STRING_MASKOFF_H
#define _STRING_MASKOFF_H 1

#include <endian.h>
#include <limits.h>
#include <stdint.h>
#include <string-optype.h>

/* Provide a mask based on the pointer alignment that sets up non-zero
   bytes before the beginning of the word.  It is used to mask off
   undesirable bits from an aligned read from an unaligned pointer.
   For instance, on a 64 bits machine with a pointer alignment of
   3 the function returns 0x0000000000ffffff for LE and 0xffffff0000000000
   (meaning to mask off the initial 3 bytes).  */
static inline op_t
create_mask (uintptr_t i)
{
  i = i % sizeof (op_t);
  return ~(((op_t)-1) << (i * CHAR_BIT));
}

/* Set up a word with each byte being c_in.  For instance, on a 64 bits
   machine with input as 0xce the function returns 0xcececececececece.  */
static inline op_t
repeat_bytes (unsigned char c_in)
{
  /* The multiplier must be unsigned: with a plain int 0x01010101 the
     product overflows for any C_IN >= 0x80 (e.g. 0xff * 0x01010101 ==
     0xffffffff > INT_MAX), which is undefined behavior in C.  */
  op_t r = c_in * 0x01010101U;

  _Static_assert (sizeof (op_t) == 4 || sizeof (op_t) == 8,
		  "unsupported op_t size");

  /* Duplicate the low 32 bits into the high 32 bits with one bstrins.d
     rather than materializing a full 64-bit immediate (a 4-instruction
     sequence the compiler does not hoist out of loops).  */
  if (sizeof (op_t) == 8)
    asm ("bstrins.d\t%0, %0, 63, 32" : "+r" (r));

  return r;
}

/* Given a mask M created by 'create_mask', clear the high (0x80) bit of
   every byte selected by M.  It is used to mask off undesirable bits from
   an aligned read from an unaligned pointer, while taking care not to
   match bytes that are meant to be matched.  For instance, on a 64 bits
   machine with a mask created from a pointer with an alignment of 3
   (0x0000000000ffffff) the function returns 0x00000000007f7f7f for LE
   and 0x7f7f7f0000000000 for BE.  */
static inline op_t
highbit_mask (op_t m)
{
  op_t low7_bytes = repeat_bytes (0x7f);
  return low7_bytes & m;
}

/* Return the address of the op_t word containing the address P.  For
   instance on address 0x0011223344556677 and op_t with size of 8,
   it returns 0x0011223344556670.  */
static inline op_t *
word_containing (char const *p)
{
  _Static_assert (sizeof (op_t) == 4 || sizeof (op_t) == 8,
		  "unsupported op_t size");

  /* Clear the low address bits with bstrins.d rather than an AND, which
     would require materializing the mask in a register first.  */
  if (sizeof (op_t) == 4)
    asm ("bstrins.d\t%0, $zero, 1, 0" : "+r" (p));
  else
    asm ("bstrins.d\t%0, $zero, 2, 0" : "+r" (p));
  return (op_t *) p;
}

#endif /* _STRING_MASKOFF_H  */

[-- Attachment #3: string-fzi.h --]
[-- Type: text/x-chdr, Size: 2825 bytes --]

/* Zero byte detection; indexes.  LoongArch version.
   Copyright (C) 2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#ifndef _STRING_FZI_H
#define _STRING_FZI_H 1

#include <limits.h>
#include <endian.h>
#include <string-fza.h>
#include <gmp.h>
#include <stdlib/gmp-impl.h>
#include <stdlib/longlong.h>

/* A subroutine for the index_zero functions.  Given a test word C, return
   the (memory order) index of the first byte (in memory order) that is
   non-zero.  */
static inline unsigned int
index_first_ (op_t c)
{
  _Static_assert (sizeof (op_t) == sizeof (long), "op_t must be long");

  unsigned int first_bit = __builtin_ctzl (c);
  return first_bit / CHAR_BIT;
}

/* Similarly, but return the (memory order) index of the last byte that is
   non-zero.  */
static inline unsigned int
index_last_ (op_t c)
{
  _Static_assert (sizeof (op_t) == sizeof (long), "op_t must be long");

  unsigned int leading_zero_bytes = __builtin_clzl (c) / CHAR_BIT;
  return sizeof (op_t) - 1 - leading_zero_bytes;
}

/* Given a word X that is known to contain a zero byte, return the index of
   the first such within the word in memory order.  */
static inline unsigned int
index_first_zero (op_t x)
{
  return index_first_ (find_zero_low (x));
}

/* Similarly, but perform the search for byte equality between X1 and X2.  */
static inline unsigned int
index_first_eq (op_t x1, op_t x2)
{
  return index_first_ (find_eq_low (x1, x2));
}

/* Similarly, but perform the search for zero within X1 or equality between
   X1 and X2.  */
static inline unsigned int
index_first_zero_eq (op_t x1, op_t x2)
{
  return index_first_ (find_zero_eq_low (x1, x2));
}

/* Similarly, but perform the search for zero within X1 or inequality between
   X1 and X2.  */
static inline unsigned int
index_first_zero_ne (op_t x1, op_t x2)
{
  return index_first_ (find_zero_ne_low (x1, x2));
}

/* Similarly, but search for the last zero within X.  */
static inline unsigned int
index_last_zero (op_t x)
{
  return index_last_ (find_zero_all (x));
}

/* Similarly, but search for the last byte where X1 and X2 are equal.  */
static inline unsigned int
index_last_eq (op_t x1, op_t x2)
{
  op_t diff = x1 ^ x2;
  return index_last_zero (diff);
}

#endif /* STRING_FZI_H */

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/2] LoongArch: Add optimized functions.
  2022-09-26 13:49                 ` Xi Ruoyao
@ 2022-09-28 14:22                   ` Richard Henderson
  2022-09-28 16:42                     ` Xi Ruoyao
  2022-09-29 11:45                   ` Adhemerval Zanella Netto
  1 sibling, 1 reply; 16+ messages in thread
From: Richard Henderson @ 2022-09-28 14:22 UTC (permalink / raw)
  To: Xi Ruoyao, Adhemerval Zanella Netto, dengjianbo
  Cc: xuchenghua, i.swmail, libc-alpha, joseph, caiyinyu

On 9/26/22 06:49, Xi Ruoyao via Libc-alpha wrote:
> Hi Adhemerval and Jianbo,
> 
> I've customized string-fzi.h and string-maskoff.h for LoongArch (see
> attachment).  With them on top of Adhermerval's v5 "Improve generic
> string routines" patch and GCC & Binutils trunk, the benchmark result
> seems comparable with the assembly version for strchr, strcmp, and
> strchrnul.

There is nothing in string-maskoff.h that the compiler should not be able to produce 
itself from the generic version.  Having a brief look, the compiler simply needs to be 
improved to unify two current AND patterns (which is an existing bug) and add the 
additional case for bstrins.d.

Similarly, there is nothing in string-fzi.h that should not be gotten from longlong.h; 
your only changes are to use __builtin_clz, which longlong.h exports as count_trailing_zeros.


r~

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/2] LoongArch: Add optimized functions.
  2022-09-28 14:22                   ` Richard Henderson
@ 2022-09-28 16:42                     ` Xi Ruoyao
  2022-09-28 19:18                       ` Richard Henderson
  2022-09-29  3:00                       ` Lulu Cheng
  0 siblings, 2 replies; 16+ messages in thread
From: Xi Ruoyao @ 2022-09-28 16:42 UTC (permalink / raw)
  To: Richard Henderson, Adhemerval Zanella Netto, dengjianbo
  Cc: xuchenghua, i.swmail, libc-alpha, joseph, caiyinyu, Lulu Cheng

On Wed, 2022-09-28 at 07:22 -0700, Richard Henderson wrote:
> On 9/26/22 06:49, Xi Ruoyao via Libc-alpha wrote:
> > Hi Adhemerval and Jianbo,
> > 
> > I've customized string-fzi.h and string-maskoff.h for LoongArch (see
> > attachment).  With them on top of Adhermerval's v5 "Improve generic
> > string routines" patch and GCC & Binutils trunk, the benchmark result
> > seems comparable with the assembly version for strchr, strcmp, and
> > strchrnul.

Hi Richard,

> There is nothing in string-maskoff.h that the compiler should not be able to produce 
> itself from the generic version.  Having a brief look, the compiler simply needs to be 
> improved to unify two current AND patterns (which is an existing bug) and add the 
> additional case for bstrins.d.

Added GCC LoongArch port maintainer into Cc:.

It's actually more complicated.  Without the inline assembly in
repeat_bytes(), the compiler does not hoist the 4-instruction 64-bit
immediate load sequence out of a loop for "some reason I don't know
yet".

> Similarly, there is nothing in string-fzi.h that should not be gotten from longlong.h; 
> your only changes are to use __builtin_clz, which longlong.h exports as count_trailing_zeros.

No, it does not.  By default longlong.h uses a table driven approach for
count_{trailing,leading}_zeros.  I can add __loongarch__ (or
__loongarch_lp64) into longlong.h though.  IIUC I need to submit the
change to GCC, then Glibc merges longlong.h from GCC, right?

-- 
Xi Ruoyao <xry111@xry111.site>
School of Aerospace Science and Technology, Xidian University

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/2] LoongArch: Add optimized functions.
  2022-09-28 16:42                     ` Xi Ruoyao
@ 2022-09-28 19:18                       ` Richard Henderson
  2022-10-10  1:39                         ` Lulu Cheng
  2022-09-29  3:00                       ` Lulu Cheng
  1 sibling, 1 reply; 16+ messages in thread
From: Richard Henderson @ 2022-09-28 19:18 UTC (permalink / raw)
  To: Xi Ruoyao, Adhemerval Zanella Netto, dengjianbo
  Cc: xuchenghua, i.swmail, libc-alpha, joseph, caiyinyu, Lulu Cheng

On 9/28/22 09:42, Xi Ruoyao wrote:
>> There is nothing in string-maskoff.h that the compiler should not be able to produce
>> itself from the generic version.  Having a brief look, the compiler simply needs to be
>> improved to unify two current AND patterns (which is an existing bug) and add the
>> additional case for bstrins.d.
> 
> Added GCC LoongArch port maintainer into Cc:.
> 
> It's actually more complicated.  Without the inline assembly in
> repeat_bytes(), the compiler does not hoist the 4-instruction 64-bit
> immediate load sequence out of a loop for "some reason I don't know
> yet".

Oh that's interesting.  I suspect that adding a REG_EQUAL note for each (or perhaps just 
the last) insn emitted by loongarch_move_integer would fix that.

>> Similarly, there is nothing in string-fzi.h that should not be gotten from longlong.h;
>> your only changes are to use __builtin_clz, which longlong.h exports as count_trailing_zeros.
> 
> No, it does not.  By default longlong.h uses a table driven approach for
> count_{trailing,leading}_zeros.  I can add __loongarch__ (or
> __loongarch_lp64) into longlong.h though.

That would best, as that header is used many more places.

>  IIUC I need to submit the change to GCC, then Glibc merges longlong.h from GCC, right?

I believe that's correct.


r~

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/2] LoongArch: Add optimized functions.
  2022-09-28 16:42                     ` Xi Ruoyao
  2022-09-28 19:18                       ` Richard Henderson
@ 2022-09-29  3:00                       ` Lulu Cheng
  1 sibling, 0 replies; 16+ messages in thread
From: Lulu Cheng @ 2022-09-29  3:00 UTC (permalink / raw)
  To: Xi Ruoyao, Richard Henderson, Adhemerval Zanella Netto, dengjianbo
  Cc: xuchenghua, i.swmail, libc-alpha, joseph, caiyinyu

> Added GCC LoongArch port maintainer into Cc:.
>
> It's actually more complicated.  Without the inline assembly in
> repeat_bytes(), the compiler does not hoist the 4-instruction 64-bit
> immediate load sequence out of a loop for "some reason I don't know
> yet".
>
Ok, I understand the problem described, I will debug the problem.


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/2] LoongArch: Add optimized functions.
  2022-09-26 13:49                 ` Xi Ruoyao
  2022-09-28 14:22                   ` Richard Henderson
@ 2022-09-29 11:45                   ` Adhemerval Zanella Netto
  1 sibling, 0 replies; 16+ messages in thread
From: Adhemerval Zanella Netto @ 2022-09-29 11:45 UTC (permalink / raw)
  To: Xi Ruoyao, dengjianbo; +Cc: libc-alpha, caiyinyu, xuchenghua, i.swmail, joseph



On 26/09/22 10:49, Xi Ruoyao wrote:
> Hi Adhemerval and Jianbo,
> 
> I've customized string-fzi.h and string-maskoff.h for LoongArch (see
> attachment).  With them on top of Adhermerval's v5 "Improve generic
> string routines" patch and GCC & Binutils trunk, the benchmark result
> seems comparable with the assembly version for strchr, strcmp, and
> strchrnul.

Excellent, that's exactly what I was aiming for this generic string.
I see that Richard's sees that gcc should handle it better, although
I do not oppose adding this header to support older gcc versions.

> 
> By the way I've tried to unroll the loop in strchr manually, but then
> the compiler produced some bad thing (moving words from a register to
> another with no reason) and the result is slower.

Have you tried to use compiler flags to explicit use loops unrolls:

CFLAGS-strchr.c += --param max-variable-expansions-in-unroller=2 --param max-unroll-times=2 -funroll-loops 

We use this on powerpc, so it might worth a try.

Another options would be to use loop_unroll.h and add a per-arch flag
to define the expansion factor.  I would prefer if we could set this
by compiler.

> 
> I've not really plotted the the result, just took a quick look with my
> eyes.  You can try the bench with my headers in sysdeps/loongarch.


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/2] LoongArch: Add optimized functions.
  2022-09-28 19:18                       ` Richard Henderson
@ 2022-10-10  1:39                         ` Lulu Cheng
  0 siblings, 0 replies; 16+ messages in thread
From: Lulu Cheng @ 2022-10-10  1:39 UTC (permalink / raw)
  To: Richard Henderson, Xi Ruoyao, Adhemerval Zanella Netto, dengjianbo
  Cc: xuchenghua, i.swmail, libc-alpha, joseph, caiyinyu

[-- Attachment #1: Type: text/plain, Size: 1068 bytes --]


在 2022/9/29 上午3:18, Richard Henderson 写道:
> On 9/28/22 09:42, Xi Ruoyao wrote:
>>> There is nothing in string-maskoff.h that the compiler should not be 
>>> able to produce
>>> itself from the generic version.  Having a brief look, the compiler 
>>> simply needs to be
>>> improved to unify two current AND patterns (which is an existing 
>>> bug) and add the
>>> additional case for bstrins.d.
>>
>> Added GCC LoongArch port maintainer into Cc:.
>>
>> It's actually more complicated.  Without the inline assembly in
>> repeat_bytes(), the compiler does not hoist the 4-instruction 64-bit
>> immediate load sequence out of a loop for "some reason I don't know
>> yet".
>
> Oh that's interesting.  I suspect that adding a REG_EQUAL note for 
> each (or perhaps just the last) insn emitted by loongarch_move_integer 
> would fix that.
>
>
I have optimized this problem, and the modification is in the
attachment.  However, in some cases, loading the immediate will take
two more instructions than the original implementation.  I'm still
working on that.


[-- Attachment #2: 0001-LoongArch-Optimize-immediate-load.patch --]
[-- Type: text/x-patch, Size: 4775 bytes --]

From 093865cc12334ed3f2db42e3c7b19b1d7ef4559a Mon Sep 17 00:00:00 2001
From: Lulu Cheng <chenglulu@loongson.cn>
Date: Sun, 9 Oct 2022 17:54:38 +0800
Subject: [PATCH] LoongArch: Optimize immediate load.

Optimize the link of https://sourceware.org/pipermail/libc-alpha/2022-September/142202.html
said in a number of repeated loading immediately.

gcc/ChangeLog:

	* config/loongarch/loongarch.cc (struct loongarch_integer_op):
	Add the member curr_value to the structure to represent the result
	of the immediate count of the load instruction at each step.
	(loongarch_build_integer): Assign a value to the member curr_value.
	(loongarch_move_integer): Optimize immediate load.
---
 gcc/config/loongarch/loongarch.cc | 57 ++++++++++++++++++-------------
 1 file changed, 34 insertions(+), 23 deletions(-)

diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc
index 70918d41860..38d822bcd49 100644
--- a/gcc/config/loongarch/loongarch.cc
+++ b/gcc/config/loongarch/loongarch.cc
@@ -154,7 +154,11 @@ enum loongarch_load_imm_method
 struct loongarch_integer_op
 {
   enum rtx_code code;
+  /* Current Immediate Count The immediate count of the load instruction.  */
   HOST_WIDE_INT value;
+  /* Represent the result of the immediate count of the load instruction at
+     each step.  */
+  HOST_WIDE_INT curr_value;
   enum loongarch_load_imm_method method;
 };
 
@@ -1523,24 +1527,27 @@ loongarch_build_integer (struct loongarch_integer_op *codes,
     {
       /* The value of the lower 32 bit be loaded with one instruction.
 	 lu12i.w.  */
-      codes[0].code = UNKNOWN;
-      codes[0].method = METHOD_NORMAL;
-      codes[0].value = low_part;
+      codes[cost].code = UNKNOWN;
+      codes[cost].method = METHOD_NORMAL;
+      codes[cost].value = low_part;
+      codes[cost].curr_value = low_part;
       cost++;
     }
   else
     {
       /* lu12i.w + ior.  */
-      codes[0].code = UNKNOWN;
-      codes[0].method = METHOD_NORMAL;
-      codes[0].value = low_part & ~(IMM_REACH - 1);
+      codes[cost].code = UNKNOWN;
+      codes[cost].method = METHOD_NORMAL;
+      codes[cost].value = low_part & ~(IMM_REACH - 1);
+      codes[cost].curr_value = codes[cost].value;
       cost++;
       HOST_WIDE_INT iorv = low_part & (IMM_REACH - 1);
       if (iorv != 0)
 	{
-	  codes[1].code = IOR;
-	  codes[1].method = METHOD_NORMAL;
-	  codes[1].value = iorv;
+	  codes[cost].code = IOR;
+	  codes[cost].method = METHOD_NORMAL;
+	  codes[cost].value = iorv;
+	  codes[cost].curr_value = low_part;
 	  cost++;
 	}
     }
@@ -1563,11 +1570,15 @@ loongarch_build_integer (struct loongarch_integer_op *codes,
 	{
 	  codes[cost].method = METHOD_LU52I;
 	  codes[cost].value = value & LU52I_B;
+	  codes[cost].curr_value = codes[cost].value | (codes[cost-1].curr_value &
+							0xfffffffffffff);
 	  return cost + 1;
 	}
 
       codes[cost].method = METHOD_LU32I;
       codes[cost].value = (value & LU32I_B) | (sign51 ? LU52I_B : 0);
+      codes[cost].curr_value = codes[cost].value | (codes[cost-1].curr_value &
+						    0xffffffff);
       cost++;
 
       /* Determine whether the 52-61 bits are sign-extended from the low order,
@@ -1576,6 +1587,8 @@ loongarch_build_integer (struct loongarch_integer_op *codes,
 	{
 	  codes[cost].method = METHOD_LU52I;
 	  codes[cost].value = value & LU52I_B;
+	  codes[cost].curr_value = codes[cost].value | (codes[cost-1].curr_value &
+							0xfffffffffffff);
 	  cost++;
 	}
     }
@@ -2959,29 +2972,27 @@ loongarch_move_integer (rtx temp, rtx dest, unsigned HOST_WIDE_INT value)
       else
 	x = force_reg (mode, x);
 
+      set_unique_reg_note (get_last_insn (), REG_EQUAL, GEN_INT (codes[i-1].curr_value));
+
       switch (codes[i].method)
 	{
 	case METHOD_NORMAL:
+	  /* mov or ior.  */
 	  x = gen_rtx_fmt_ee (codes[i].code, mode, x,
 			      GEN_INT (codes[i].value));
 	  break;
 	case METHOD_LU32I:
-	  emit_insn (
-	    gen_rtx_SET (x,
-			 gen_rtx_IOR (DImode,
-				      gen_rtx_ZERO_EXTEND (
-					DImode, gen_rtx_SUBREG (SImode, x, 0)),
-				      GEN_INT (codes[i].value))));
+	  gcc_assert (mode == DImode);
+	  /* lu32i_d */
+	  x = gen_rtx_IOR (mode, gen_rtx_ZERO_EXTEND (mode,
+						gen_rtx_SUBREG (SImode, x, 0)),
+			   GEN_INT (codes[i].value));
 	  break;
 	case METHOD_LU52I:
-	  emit_insn (gen_lu52i_d (x, x, GEN_INT (0xfffffffffffff),
-				  GEN_INT (codes[i].value)));
-	  break;
-	case METHOD_INSV:
-	  emit_insn (
-	    gen_rtx_SET (gen_rtx_ZERO_EXTRACT (DImode, x, GEN_INT (20),
-					       GEN_INT (32)),
-			 gen_rtx_REG (DImode, 0)));
+	  gcc_assert (mode == DImode);
+	  /* lu52i_d.  */
+	  x = gen_rtx_IOR (mode, gen_rtx_AND (mode, x, GEN_INT (0xfffffffffffff)),
+			   GEN_INT (codes[i].value));
 	  break;
 	default:
 	  gcc_unreachable ();
-- 
2.31.1


^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2022-10-10  1:39 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-08-15  8:57 [PATCH 0/2] LoongArch: Add optimized functions caiyinyu
2022-08-15  8:57 ` [PATCH 1/2] LoongArch: Add optimized string functions: str{chr, chrnul, cmp, ncmp} caiyinyu
2022-08-15  8:57 ` [PATCH 2/2] LoongArch: Add optimized function: memmove caiyinyu
2022-08-15 14:02 ` [PATCH 0/2] LoongArch: Add optimized functions Carlos O'Donell
2022-08-15 20:46   ` Joseph Myers
2022-09-02 12:27     ` Adhemerval Zanella Netto
     [not found]       ` <403f78f0-55d9-48cf-c62a-4a0462a76987@loongson.cn>
     [not found]         ` <2022091910031722091613@loongson.cn>
2022-09-19 20:16           ` Adhemerval Zanella Netto
2022-09-20  9:54             ` Xi Ruoyao
2022-09-22 18:05               ` Adhemerval Zanella Netto
2022-09-26 13:49                 ` Xi Ruoyao
2022-09-28 14:22                   ` Richard Henderson
2022-09-28 16:42                     ` Xi Ruoyao
2022-09-28 19:18                       ` Richard Henderson
2022-10-10  1:39                         ` Lulu Cheng
2022-09-29  3:00                       ` Lulu Cheng
2022-09-29 11:45                   ` Adhemerval Zanella Netto

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).