public inbox for libc-alpha@sourceware.org
 help / color / mirror / Atom feed
From: Christoph Muellner <christoph.muellner@vrull.eu>
To: libc-alpha@sourceware.org, Palmer Dabbelt <palmer@dabbelt.com>,
	Darius Rad <darius@bluespec.com>,
	Andrew Waterman <andrew@sifive.com>, DJ Delorie <dj@redhat.com>,
	Vineet Gupta <vineetg@rivosinc.com>,
	Kito Cheng <kito.cheng@sifive.com>,
	Jeff Law <jeffreyalaw@gmail.com>,
	Philipp Tomsich <philipp.tomsich@vrull.eu>,
	Heiko Stuebner <heiko.stuebner@vrull.eu>
Cc: "Christoph Müllner" <christoph.muellner@vrull.eu>
Subject: [RFC PATCH 16/19] riscv: Add accelerated strcmp routines
Date: Tue,  7 Feb 2023 01:16:15 +0100	[thread overview]
Message-ID: <20230207001618.458947-17-christoph.muellner@vrull.eu> (raw)
In-Reply-To: <20230207001618.458947-1-christoph.muellner@vrull.eu>

From: Christoph Müllner <christoph.muellner@vrull.eu>

The implementation of strcmp() can be accelerated using Zbb's orc.b
instruction and fast unaligned accesses. However, strcmp can use
unaligned accesses only if they do not change the exception
behaviour (compared to a single-byte compare loop).
Let's add an implementation that does all that.
Additionally, let's add the strcmp implementation from the
Bitmanip specification, which does not do any unaligned accesses.

Signed-off-by: Christoph Müllner <christoph.muellner@vrull.eu>
---
 sysdeps/riscv/multiarch/Makefile              |   4 +-
 sysdeps/riscv/multiarch/ifunc-impl-list.c     |   4 +-
 sysdeps/riscv/multiarch/strcmp.c              |  11 +-
 sysdeps/riscv/multiarch/strcmp_zbb.S          | 104 +++++++++
 .../riscv/multiarch/strcmp_zbb_unaligned.S    | 213 ++++++++++++++++++
 5 files changed, 332 insertions(+), 4 deletions(-)
 create mode 100644 sysdeps/riscv/multiarch/strcmp_zbb.S
 create mode 100644 sysdeps/riscv/multiarch/strcmp_zbb_unaligned.S

diff --git a/sysdeps/riscv/multiarch/Makefile b/sysdeps/riscv/multiarch/Makefile
index 3017bde75a..73a62be85d 100644
--- a/sysdeps/riscv/multiarch/Makefile
+++ b/sysdeps/riscv/multiarch/Makefile
@@ -11,5 +11,7 @@ sysdep_routines += \
 	strlen_generic \
 	strlen_zbb \
 	\
-	strcmp_generic
+	strcmp_generic \
+	strcmp_zbb \
+	strcmp_zbb_unaligned
 endif
diff --git a/sysdeps/riscv/multiarch/ifunc-impl-list.c b/sysdeps/riscv/multiarch/ifunc-impl-list.c
index 64331a4c7f..d354aa1178 100644
--- a/sysdeps/riscv/multiarch/ifunc-impl-list.c
+++ b/sysdeps/riscv/multiarch/ifunc-impl-list.c
@@ -59,7 +59,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_generic))
 
   IFUNC_IMPL (i, name, strcmp,
-	      IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcmp_generic))
+	      IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_zbb_unaligned)
+	      IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_zbb)
+	      IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_generic))
 
   return i;
 }
diff --git a/sysdeps/riscv/multiarch/strcmp.c b/sysdeps/riscv/multiarch/strcmp.c
index 8c21a90afd..d3f2fe19ae 100644
--- a/sysdeps/riscv/multiarch/strcmp.c
+++ b/sysdeps/riscv/multiarch/strcmp.c
@@ -30,8 +30,15 @@
 
 extern __typeof (__redirect_strcmp) __libc_strcmp;
 extern __typeof (__redirect_strcmp) __strcmp_generic attribute_hidden;
-
-libc_ifunc (__libc_strcmp, __strcmp_generic);
+extern __typeof (__redirect_strcmp) __strcmp_zbb attribute_hidden;
+extern __typeof (__redirect_strcmp) __strcmp_zbb_unaligned attribute_hidden;
+
+libc_ifunc (__libc_strcmp,
+	    HAVE_RV(zbb) && HAVE_FAST_UNALIGNED()
+	    ? __strcmp_zbb_unaligned
+	    : HAVE_RV(zbb)
+	      ? __strcmp_zbb
+	      : __strcmp_generic);
 
 # undef strcmp
 strong_alias (__libc_strcmp, strcmp);
diff --git a/sysdeps/riscv/multiarch/strcmp_zbb.S b/sysdeps/riscv/multiarch/strcmp_zbb.S
new file mode 100644
index 0000000000..1c265d6107
--- /dev/null
+++ b/sysdeps/riscv/multiarch/strcmp_zbb.S
@@ -0,0 +1,104 @@
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <sys/asm.h>
+
+/* Assumptions: rvi_zbb.  */
+/* Implementation from the Bitmanip specification.  */
+
+#define src1		a0
+#define result		a0
+#define src2		a1
+#define data1		a2
+#define data2		a3
+#define align		a4
+#define data1_orcb	t0
+#define m1		t2
+
+#if __riscv_xlen == 64
+# define REG_L	ld
+# define SZREG	8
+#else
+# define REG_L	lw
+# define SZREG	4
+#endif
+
+#ifndef STRCMP
+# define STRCMP __strcmp_zbb
+#endif
+
+.option push
+.option arch,+zbb
+
+ENTRY_ALIGN (STRCMP, 6)
+	or	align, src1, src2
+	and	align, align, SZREG-1
+	bnez	align, L(simpleloop)
+	li	m1, -1
+
+	/* Main loop for aligned strings.  */
+	.p2align 2
+L(loop):
+	REG_L	data1, 0(src1)
+	REG_L	data2, 0(src2)
+	orc.b	data1_orcb, data1
+	bne	data1_orcb, m1, L(foundnull)
+	addi	src1, src1, SZREG
+	addi	src2, src2, SZREG
+	beq	data1, data2, L(loop)
+
+	/* Words don't match, and no null byte in the first word.
+	 * Get bytes in big-endian order and compare.  */
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+	rev8	data1, data1
+	rev8	data2, data2
+#endif
+	/* Synthesize (data1 >= data2) ? 1 : -1 in a branchless sequence.  */
+	sltu	result, data1, data2
+	neg	result, result
+	ori	result, result, 1
+	ret
+
+L(foundnull):
+	/* Found a null byte.
+	 * If words don't match, fall back to simple loop.  */
+	bne	data1, data2, L(simpleloop)
+
+	/* Otherwise, strings are equal.  */
+	li	result, 0
+	ret
+
+	/* Simple loop for misaligned strings.  */
+	.p2align 3
+L(simpleloop):
+	lbu	data1, 0(src1)
+	lbu	data2, 0(src2)
+	addi	src1, src1, 1
+	addi	src2, src2, 1
+	bne	data1, data2, L(sub)
+	bnez	data1, L(simpleloop)
+
+L(sub):
+	sub	result, data1, data2
+	ret
+
+.option pop
+
+END (STRCMP)
+libc_hidden_builtin_def (STRCMP)
diff --git a/sysdeps/riscv/multiarch/strcmp_zbb_unaligned.S b/sysdeps/riscv/multiarch/strcmp_zbb_unaligned.S
new file mode 100644
index 0000000000..ec21982b65
--- /dev/null
+++ b/sysdeps/riscv/multiarch/strcmp_zbb_unaligned.S
@@ -0,0 +1,213 @@
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <sys/asm.h>
+
+/* Assumptions: rvi_zbb with fast unaligned access.  */
+/* Implementation inspired by aarch64/strcmp.S.  */
+
+#define src1		a0
+#define result		a0
+#define src2		a1
+#define off		a3
+#define m1		a4
+#define align1		a5
+#define src3		a6
+#define tmp		a7
+
+#define data1		t0
+#define data2		t1
+#define b1		t0
+#define b2		t1
+#define data3		t2
+#define data1_orcb	t3
+#define data3_orcb	t4
+#define shift		t5
+
+#if __riscv_xlen == 64
+# define REG_L	ld
+# define SZREG	8
+# define PTRLOG	3
+#else
+# define REG_L	lw
+# define SZREG	4
+# define PTRLOG	2
+#endif
+
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+# error big endian is untested!
+# define CZ	ctz
+# define SHIFT	srl
+# define SHIFT2	sll
+#else
+# define CZ	ctz
+# define SHIFT	sll
+# define SHIFT2	srl
+#endif
+
+#ifndef STRCMP
+# define STRCMP __strcmp_zbb_unaligned
+#endif
+
+.option push
+.option arch,+zbb
+
+ENTRY_ALIGN (STRCMP, 6)
+	/* off...delta from src1 to src2.  */
+	sub	off, src2, src1
+	li	m1, -1
+	andi	tmp, off, SZREG-1
+	andi	align1, src1, SZREG-1
+	bnez	tmp, L(misaligned8)
+	bnez	align1, L(mutual_align)
+
+	.p2align 4
+L(loop_aligned):
+	REG_L	data1, 0(src1)
+	add	tmp, src1, off
+	addi	src1, src1, SZREG
+	REG_L	data2, 0(tmp)
+
+L(start_realigned):
+	orc.b	data1_orcb, data1
+	bne	data1_orcb, m1, L(end)
+	beq	data1, data2, L(loop_aligned)
+
+L(fast_end):
+	/* Words don't match, and no NUL byte in one word.
+	   Get bytes in big-endian order and compare as words.  */
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+	rev8	data1, data1
+	rev8	data2, data2
+#endif
+	/* Synthesize (data1 >= data2) ? 1 : -1 in a branchless sequence.  */
+	sltu	result, data1, data2
+	neg	result, result
+	ori	result, result, 1
+	ret
+
+L(end_orc):
+	orc.b	data1_orcb, data1
+L(end):
+	/* Words don't match or NUL byte in at least one word.
+	   data1_orcb holds orc.b value of data1.  */
+	xor	tmp, data1, data2
+	orc.b	tmp, tmp
+
+	orn	tmp, tmp, data1_orcb
+	CZ	shift, tmp
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+	rev8	data1, data1
+	rev8	data2, data2
+#endif
+	sll	data1, data1, shift
+	sll	data2, data2, shift
+	srl	b1, data1, SZREG*8-8
+	srl	b2, data2, SZREG*8-8
+
+L(end_singlebyte):
+	sub	result, b1, b2
+	ret
+
+	.p2align 4
+L(mutual_align):
+	/* Sources are mutually aligned, but are not currently at an
+	   alignment boundary.  Round down the addresses and then mask off
+	   the bytes that precede the start point.  */
+	andi	src1, src1, -SZREG
+	add	tmp, src1, off
+	REG_L	data1, 0(src1)
+	addi	src1, src1, SZREG
+	REG_L	data2, 0(tmp)
+	/* Get number of bits to mask.  */
+	sll	shift, src2, 3
+	/* Bits to mask are now 0, others are 1.  */
+	SHIFT	tmp, m1, shift
+	/* Or with inverted value -> masked bits become 1.  */
+	orn	data1, data1, tmp
+	orn	data2, data2, tmp
+	j	L(start_realigned)
+
+L(misaligned8):
+	/* Skip slow loop if SRC1 is aligned.  */
+	beqz	align1, L(src1_aligned)
+L(do_misaligned):
+	/* Align SRC1 to 8 bytes.  */
+	lbu	b1, 0(src1)
+	lbu	b2, 0(src2)
+	beqz	b1, L(end_singlebyte)
+	bne	b1, b2, L(end_singlebyte)
+	addi	src1, src1, 1
+	addi	src2, src2, 1
+	andi	align1, src1, SZREG-1
+	bnez	align1, L(do_misaligned)
+
+L(src1_aligned):
+	/* SRC1 is aligned. Align SRC2 down and check for NUL there.
+	 * If there is no NUL, we may read the next word from SRC2.
+	 * If there is a NUL, we must not read a complete word from SRC2
+	 * because we might cross a page boundary.  */
+	/* Get number of bits to mask (upper bits are ignored by shifts).  */
+	sll	shift, src2, 3
+	/* src3 := align_down (src2)  */
+	andi	src3, src2, -SZREG
+	REG_L   data3, 0(src3)
+	addi	src3, src3, SZREG
+
+	/* Bits to mask are now 0, others are 1.  */
+	SHIFT	tmp, m1, shift
+	/* Or with inverted value -> masked bits become 1.  */
+	orn	data3_orcb, data3, tmp
+	/* Check for NUL in next aligned word.  */
+	orc.b	data3_orcb, data3_orcb
+	bne	data3_orcb, m1, L(unaligned_nul)
+
+	.p2align 4
+L(loop_unaligned):
+	/* Read the (aligned) data1 and the unaligned data2.  */
+	REG_L	data1, 0(src1)
+	addi	src1, src1, SZREG
+	REG_L	data2, 0(src2)
+	addi	src2, src2, SZREG
+	orc.b	data1_orcb, data1
+	bne	data1_orcb, m1, L(end)
+	bne	data1, data2, L(end)
+
+	/* Read the next aligned-down word.  */
+	REG_L	data3, 0(src3)
+	addi	src3, src3, SZREG
+	orc.b	data3_orcb, data3
+	beq	data3_orcb, m1, L(loop_unaligned)
+
+L(unaligned_nul):
+	/* src1 points to unread word (only first bytes relevant).
+	 * data3 holds next aligned-down word with NUL.
+	 * Compare the first bytes of data1 with the last bytes of data3.  */
+	REG_L	data1, 0(src1)
+	/* Shift NUL bytes into data3 to become data2.  */
+	SHIFT2	data2, data3, shift
+	bne	data1, data2, L(end_orc)
+	li	result, 0
+	ret
+
+.option pop
+
+END (STRCMP)
+libc_hidden_builtin_def (STRCMP)
-- 
2.39.1


  parent reply	other threads:[~2023-02-07  0:16 UTC|newest]

Thread overview: 42+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-02-07  0:15 [RFC PATCH 00/19] riscv: ifunc support with optimized mem*/str*/cpu_relax routines Christoph Muellner
2023-02-07  0:16 ` [RFC PATCH 01/19] Inhibit early libcalls before ifunc support is ready Christoph Muellner
2023-02-07  0:16 ` [RFC PATCH 02/19] riscv: LEAF: Use C_LABEL() to construct the asm name for a C symbol Christoph Muellner
2023-02-07  0:16 ` [RFC PATCH 03/19] riscv: Add ENTRY_ALIGN() macro Christoph Muellner
2023-02-07  0:16 ` [RFC PATCH 04/19] riscv: Add hart feature run-time detection framework Christoph Muellner
2023-02-07  0:16 ` [RFC PATCH 05/19] riscv: Introduction of ISA extensions Christoph Muellner
2023-02-07  0:16 ` [RFC PATCH 06/19] riscv: Adding ISA string parser for environment variables Christoph Muellner
2023-02-07  6:20   ` David Abdurachmanov
2023-02-07  0:16 ` [RFC PATCH 07/19] riscv: hart-features: Add fast_unaligned property Christoph Muellner
2023-02-07  0:16 ` [RFC PATCH 08/19] riscv: Add (empty) ifunc framework Christoph Muellner
2023-02-07  0:16 ` [RFC PATCH 09/19] riscv: Add ifunc support for memset Christoph Muellner
2023-02-07  0:16 ` [RFC PATCH 10/19] riscv: Add accelerated memset routines for RV64 Christoph Muellner
2023-02-07  0:16 ` [RFC PATCH 11/19] riscv: Add ifunc support for memcpy/memmove Christoph Muellner
2023-02-07  0:16 ` [RFC PATCH 12/19] riscv: Add accelerated memcpy/memmove routines for RV64 Christoph Muellner
2023-02-07  0:16 ` [RFC PATCH 13/19] riscv: Add ifunc support for strlen Christoph Muellner
2023-02-07  0:16 ` [RFC PATCH 14/19] riscv: Add accelerated strlen routine Christoph Muellner
2023-02-07  0:16 ` [RFC PATCH 15/19] riscv: Add ifunc support for strcmp Christoph Muellner
2023-02-07  0:16 ` Christoph Muellner [this message]
2023-02-07 11:57   ` [RFC PATCH 16/19] riscv: Add accelerated strcmp routines Xi Ruoyao
2023-02-07 14:15     ` Christoph Müllner
2023-03-31  5:06       ` Jeff Law
2023-03-31 12:31         ` Adhemerval Zanella Netto
2023-03-31 14:30           ` Jeff Law
2023-03-31 14:48             ` Adhemerval Zanella Netto
2023-03-31 17:19               ` Palmer Dabbelt
2023-03-31 14:32       ` Jeff Law
2023-02-07  0:16 ` [RFC PATCH 17/19] riscv: Add ifunc support for strncmp Christoph Muellner
2023-02-07  0:16 ` [RFC PATCH 18/19] riscv: Add an optimized strncmp routine Christoph Muellner
2023-02-07  1:19   ` Noah Goldstein
2023-02-08 15:13     ` Philipp Tomsich
2023-02-08 17:55       ` Palmer Dabbelt
2023-02-08 19:48         ` Adhemerval Zanella Netto
2023-02-08 18:04       ` Noah Goldstein
2023-02-07  0:16 ` [RFC PATCH 19/19] riscv: Add __riscv_cpu_relax() to allow yielding in busy loops Christoph Muellner
2023-02-07  0:23   ` Andrew Waterman
2023-02-07  0:29     ` Christoph Müllner
2023-02-07  2:59 ` [RFC PATCH 00/19] riscv: ifunc support with optimized mem*/str*/cpu_relax routines Kito Cheng
2023-02-07 16:40 ` Adhemerval Zanella Netto
2023-02-07 17:16   ` DJ Delorie
2023-02-07 19:32     ` Philipp Tomsich
2023-02-07 21:14       ` DJ Delorie
2023-02-08 11:26         ` Christoph Müllner

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230207001618.458947-17-christoph.muellner@vrull.eu \
    --to=christoph.muellner@vrull.eu \
    --cc=andrew@sifive.com \
    --cc=darius@bluespec.com \
    --cc=dj@redhat.com \
    --cc=heiko.stuebner@vrull.eu \
    --cc=jeffreyalaw@gmail.com \
    --cc=kito.cheng@sifive.com \
    --cc=libc-alpha@sourceware.org \
    --cc=palmer@dabbelt.com \
    --cc=philipp.tomsich@vrull.eu \
    --cc=vineetg@rivosinc.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).