From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mail-ej1-x635.google.com (mail-ej1-x635.google.com [IPv6:2a00:1450:4864:20::635]) by sourceware.org (Postfix) with ESMTPS id D33E53858D1E for ; Tue, 7 Feb 2023 01:20:11 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.2 sourceware.org D33E53858D1E Authentication-Results: sourceware.org; dmarc=pass (p=none dis=none) header.from=gmail.com Authentication-Results: sourceware.org; spf=pass smtp.mailfrom=gmail.com Received: by mail-ej1-x635.google.com with SMTP id hr39so9772806ejc.7 for ; Mon, 06 Feb 2023 17:20:11 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20210112; h=content-transfer-encoding:cc:to:subject:message-id:date:from :in-reply-to:references:mime-version:from:to:cc:subject:date :message-id:reply-to; bh=S1vdpoNDlviXkBVMU/YXxWFE/pJTIo/sWPUZmnvSSPM=; b=BCDnD7MVnPyljNjg4wTACuujNX0ywpiXyJUm10JuftzId2q7RyLLAimAUYiRyUxTTJ OdW7/8HD5fpxLZdq/ALTAjbZNN6KEtVzNNZl98jjb05rCddRVaIJgkm5HlEkhq7WXoi8 uXePJkVtUPkbdyJpE3209oQD7nMtowvlRGeJgdIlEFhcocxwCT6Uzq3yVzvMeUHt7AeN JYIBTZP73Kd+dRAhw0Pn9GfzHp58ad/riGjOKmrCML8ThDAl36izxI4Y1pClAWbi7wjV MhLrzlMDSjPz7v4Ze4aQ1HPJPNhptfBG4bpr0lBm2AYrE6ZjnrjbMM3lxdX6SmhOa/q/ mlNQ== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=content-transfer-encoding:cc:to:subject:message-id:date:from :in-reply-to:references:mime-version:x-gm-message-state:from:to:cc :subject:date:message-id:reply-to; bh=S1vdpoNDlviXkBVMU/YXxWFE/pJTIo/sWPUZmnvSSPM=; b=hRFwuDRe1rwiSZP1E9NsyTYF8YnPg6Xiy3UUSJiM0Faeddl5lVjIuW6c4qF7kGwofJ wTu9vJ6NzrED9Jr1fTBojA8uwaVT6Nev12rNn9ipX+LuS14inLnvcmEdmBLn6kzIyexn DCQhu7WBbRjadIDC6qLD/O4/v2n4rY6XL/J34LrpCT2UHOVeJ/loIgK1slJ+78t4q1Yi AC/TkD02pYK3JIz9ZQw5wB6K+QG5aE4zNiJWFVde1iJF6af8EtjIGCGqtD9PrLcypkDz SbDCH+yldA3FPxVb6SqTBDmprkTOM1P/5O/02UYIXWAD7JBl9GhLLfs9bKD92EfuAsrB dySA== X-Gm-Message-State: AO0yUKVQnUn0z1g/hIFuT39bMe725dst1Ynu5JPPlyBp+3O2fsnXVYio swgq0Ll3kIzhn8iu2E+5JTZp1YIiQB3KhpmJb6k= 
X-Google-Smtp-Source: AK7set/GfUB5+5BueO9qcnIsB9Dm7IF2QB3UfqSXCCG+2K2kKGjFTY3JEW0unUVda2XVhkuLYZS18gkXlgLbfnp2hNQ= X-Received: by 2002:a17:906:b1d0:b0:87b:db55:f3e5 with SMTP id bv16-20020a170906b1d000b0087bdb55f3e5mr357888ejb.289.1675732810380; Mon, 06 Feb 2023 17:20:10 -0800 (PST) MIME-Version: 1.0 References: <20230207001618.458947-1-christoph.muellner@vrull.eu> <20230207001618.458947-19-christoph.muellner@vrull.eu> In-Reply-To: <20230207001618.458947-19-christoph.muellner@vrull.eu> From: Noah Goldstein Date: Mon, 6 Feb 2023 19:19:59 -0600 Message-ID: Subject: Re: [RFC PATCH 18/19] riscv: Add an optimized strncmp routine To: Christoph Muellner Cc: libc-alpha@sourceware.org, Palmer Dabbelt , Darius Rad , Andrew Waterman , DJ Delorie , Vineet Gupta , Kito Cheng , Jeff Law , Philipp Tomsich , Heiko Stuebner Content-Type: text/plain; charset="UTF-8" Content-Transfer-Encoding: quoted-printable X-Spam-Status: No, score=-9.7 required=5.0 tests=BAYES_00,DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,DKIM_VALID_EF,FREEMAIL_FROM,GIT_PATCH_0,KAM_SHORT,RCVD_IN_DNSWL_NONE,SPF_HELO_NONE,SPF_PASS,TXREP autolearn=ham autolearn_force=no version=3.4.6 X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on server2.sourceware.org List-Id: On Mon, Feb 6, 2023 at 6:23 PM Christoph Muellner wrote: > > From: Christoph Müllner > > The implementation of strncmp() can be accelerated using Zbb's orc.b > instruction. Let's add an optimized implementation that makes use > of this instruction. > > Signed-off-by: Christoph Müllner Not necessary, but imo performance patches should have at least some reference to the expected speedup versus the existing alternatives. 
> --- > sysdeps/riscv/multiarch/Makefile | 3 +- > sysdeps/riscv/multiarch/ifunc-impl-list.c | 1 + > sysdeps/riscv/multiarch/strncmp.c | 6 +- > sysdeps/riscv/multiarch/strncmp_zbb.S | 119 ++++++++++++++++++++++ > 4 files changed, 127 insertions(+), 2 deletions(-) > create mode 100644 sysdeps/riscv/multiarch/strncmp_zbb.S > > diff --git a/sysdeps/riscv/multiarch/Makefile b/sysdeps/riscv/multiarch/Makefile > index 056ce2ffc0..9f22e31b99 100644 > --- a/sysdeps/riscv/multiarch/Makefile > +++ b/sysdeps/riscv/multiarch/Makefile > @@ -14,5 +14,6 @@ sysdep_routines += \ > strcmp_generic \ > strcmp_zbb \ > strcmp_zbb_unaligned \ > - strncmp_generic > + strncmp_generic \ > + strncmp_zbb > endif > diff --git a/sysdeps/riscv/multiarch/ifunc-impl-list.c b/sysdeps/riscv/multiarch/ifunc-impl-list.c > index eb37ed6017..82fd34d010 100644 > --- a/sysdeps/riscv/multiarch/ifunc-impl-list.c > +++ b/sysdeps/riscv/multiarch/ifunc-impl-list.c > @@ -64,6 +64,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_generic)) > > IFUNC_IMPL (i, name, strncmp, > + IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_zbb) > IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_generic)) > return i; > } > diff --git a/sysdeps/riscv/multiarch/strncmp.c b/sysdeps/riscv/multiarch/strncmp.c > index 970aeb8b85..5b0fe08e98 100644 > --- a/sysdeps/riscv/multiarch/strncmp.c > +++ b/sysdeps/riscv/multiarch/strncmp.c > @@ -30,8 +30,12 @@ > > extern __typeof (__redirect_strncmp) __libc_strncmp; > extern __typeof (__redirect_strncmp) __strncmp_generic attribute_hidden; > +extern __typeof (__redirect_strncmp) __strncmp_zbb attribute_hidden; > > -libc_ifunc (__libc_strncmp, __strncmp_generic); > +libc_ifunc (__libc_strncmp, > + HAVE_RV(zbb) > + ? 
__strncmp_zbb > + : __strncmp_generic); > > # undef strncmp > strong_alias (__libc_strncmp, strncmp); > diff --git a/sysdeps/riscv/multiarch/strncmp_zbb.S b/sysdeps/riscv/multiarch/strncmp_zbb.S > new file mode 100644 > index 0000000000..29cff30def > --- /dev/null > +++ b/sysdeps/riscv/multiarch/strncmp_zbb.S > @@ -0,0 +1,119 @@ > +/* Copyright (C) 2022 Free Software Foundation, Inc. > + > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library. If not, see > + <https://www.gnu.org/licenses/>. */ > + > +#include > +#include > + > +/* Assumptions: rvi_zbb. */ > + > +#define src1 a0 > +#define result a0 > +#define src2 a1 > +#define len a2 > +#define data1 a2 > +#define data2 a3 > +#define align a4 > +#define data1_orcb t0 > +#define limit t1 > +#define fast_limit t2 > +#define m1 t3 > + > +#if __riscv_xlen == 64 > +# define REG_L ld > +# define SZREG 8 > +# define PTRLOG 3 > +#else > +# define REG_L lw > +# define SZREG 4 > +# define PTRLOG 2 > +#endif > + > +#ifndef STRNCMP > +# define STRNCMP __strncmp_zbb > +#endif > + > +.option push > +.option arch,+zbb > + > +ENTRY_ALIGN (STRNCMP, 6) > + beqz len, L(equal) > + or align, src1, src2 > + and align, align, SZREG-1 > + add limit, src1, len > + bnez align, L(simpleloop) > + li m1, -1 > + > + /* Adjust limit for fast-path. */ > + andi fast_limit, limit, -SZREG > + > + /* Main loop for aligned string. 
*/ > + .p2align 3 > +L(loop): > + bge src1, fast_limit, L(simpleloop) > + REG_L data1, 0(src1) > + REG_L data2, 0(src2) > + orc.b data1_orcb, data1 > + bne data1_orcb, m1, L(foundnull) > + addi src1, src1, SZREG > + addi src2, src2, SZREG > + beq data1, data2, L(loop) > + > + /* Words don't match, and no null byte in the first > + * word. Get bytes in big-endian order and compare. */ > +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ > + rev8 data1, data1 > + rev8 data2, data2 > +#endif > + /* Synthesize (data1 >= data2) ? 1 : -1 in a branchless sequence. */ > + sltu result, data1, data2 > + neg result, result > + ori result, result, 1 > + ret > + > +L(foundnull): > + /* Found a null byte. > + * If words don't match, fall back to simple loop. */ > + bne data1, data2, L(simpleloop) > + > + /* Otherwise, strings are equal. */ > + li result, 0 > + ret > + > + /* Simple loop for misaligned strings. */ > + .p2align 3 > +L(simpleloop): > + bge src1, limit, L(equal) > + lbu data1, 0(src1) > + addi src1, src1, 1 > + lbu data2, 0(src2) > + addi src2, src2, 1 > + bne data1, data2, L(sub) > + bnez data1, L(simpleloop) > + > +L(sub): > + sub result, data1, data2 > + ret > + > +L(equal): > + li result, 0 > + ret > + > +.option pop > + > +END (STRNCMP) > +libc_hidden_builtin_def (STRNCMP) > -- > 2.39.1 >