From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mail-ej1-x635.google.com (mail-ej1-x635.google.com [IPv6:2a00:1450:4864:20::635]) by sourceware.org (Postfix) with ESMTPS id 017DF3858D39 for ; Wed, 8 Feb 2023 15:13:57 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.2 sourceware.org 017DF3858D39 Authentication-Results: sourceware.org; dmarc=none (p=none dis=none) header.from=vrull.eu Authentication-Results: sourceware.org; spf=pass smtp.mailfrom=vrull.eu Received: by mail-ej1-x635.google.com with SMTP id p26so51870298ejx.13 for ; Wed, 08 Feb 2023 07:13:56 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=vrull.eu; s=google; h=content-transfer-encoding:cc:to:subject:message-id:date:from :in-reply-to:references:mime-version:from:to:cc:subject:date :message-id:reply-to; bh=3pDLVJD83DuHJ7PPcIz88ZrLWNuKpbQZPGK5hDQFcsc=; b=dwcFmW9QW+92/qRNjp01n3d8DExIJ/zlsel/ptGWg7pKoKhFTsdHlg41Bg38qzL8VD Xtwt3AseTY/ktb+UOjEtpZWLDhO31vMlde+lyJqD8uOm6nCfibFA49hWFXHW44f2bjXE tfX8AXzUmi69bbPjqdYp4bmAFqoeZArZTx+Du+1DSPqhzp3F5yJgyJQ+0zP5OqlRnsZp Gy1EJA0mzrqVzQfVOxhpSP9oMjMzzQwSmb99cKchJTqA0VAQgEsji8nJ2dPgDJPmnSK5 PmKPI3SrzwzLca9mKW9l9YfI3iogtWfE/1BaLbmJDT0WFz75iriuyvasTB/RVUj1J2D5 7VnQ== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=content-transfer-encoding:cc:to:subject:message-id:date:from :in-reply-to:references:mime-version:x-gm-message-state:from:to:cc :subject:date:message-id:reply-to; bh=3pDLVJD83DuHJ7PPcIz88ZrLWNuKpbQZPGK5hDQFcsc=; b=sNGRmF1TskygoXGU+wIJ/VuBzmbCsIFUcL6oG/pVBN4lU6Y/qopVH/gL2A1iUh9BJF Pgf9mec0FsMaa8iIoTVcE0uXcDtLmnMmmYvTBfP7M2qxQD/lHkW/KPYawn2I5Obivcqu GHg5K/Vfn5+d8S7Y1A7+zXNBxQWF0WnaeYsLBF7cfRoLE7VqHCuqbQ1Q+w+9LQQeunRF f9HnllAooM2x0D0dJ6RuOQnzz2jBfnQd0qVpTy8ec4+Cc7KIotReBSVF1FMmVmbliEhl WOOuJAVtem5v1yuAxpK/0RjNAy0BklIaZ/zFqCVORzq8Ku0QmyTVf8j64K+yaiqv0xyL aldQ== X-Gm-Message-State: AO0yUKXmBK9WHU5H0DWWGXl8tCi4qH2JGAIFyWWtxZ9gLR7G4YCCP4XR vE8XqGdXqzESUGgArVSlsvjzD9Iqt32juhAS3BW6wQ== 
X-Google-Smtp-Source: AK7set9e3CDLouxv5YttbzvkQBHQhpwDanYqx64HXaGCpC1RJPSjvBZRIkg09VAZORIqGvOJOQ5AQ2lgT2BKB4qDDos= X-Received: by 2002:a17:906:a1d6:b0:86f:ef27:3f81 with SMTP id bx22-20020a170906a1d600b0086fef273f81mr2039468ejb.56.1675869235568; Wed, 08 Feb 2023 07:13:55 -0800 (PST) MIME-Version: 1.0 References: <20230207001618.458947-1-christoph.muellner@vrull.eu> <20230207001618.458947-19-christoph.muellner@vrull.eu> In-Reply-To: From: Philipp Tomsich Date: Wed, 8 Feb 2023 16:13:44 +0100 Message-ID: Subject: Re: [RFC PATCH 18/19] riscv: Add an optimized strncmp routine To: Noah Goldstein Cc: Christoph Muellner , libc-alpha@sourceware.org, Palmer Dabbelt , Darius Rad , Andrew Waterman , DJ Delorie , Vineet Gupta , Kito Cheng , Jeff Law , Heiko Stuebner Content-Type: text/plain; charset="UTF-8" Content-Transfer-Encoding: quoted-printable X-Spam-Status: No, score=-9.8 required=5.0 tests=BAYES_00,DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,DKIM_VALID_EF,GIT_PATCH_0,JMQ_SPF_NEUTRAL,KAM_SHORT,RCVD_IN_DNSWL_NONE,SPF_HELO_NONE,SPF_PASS,TXREP autolearn=ham autolearn_force=no version=3.4.6 X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on server2.sourceware.org List-Id: On Tue, 7 Feb 2023 at 02:20, Noah Goldstein wrote: > > On Mon, Feb 6, 2023 at 6:23 PM Christoph Muellner > wrote: > > > > From: Christoph Müllner > > > > The implementation of strncmp() can be accelerated using Zbb's orc.b > > instruction. Let's add an optimized implementation that makes use > > of this instruction. > > > > Signed-off-by: Christoph Müllner > > Not necessary, but imo performance patches should have at least some reference > to the expected speedup versus the existing alternatives. Given that this is effectively a SWAR-like optimization (orc.b allows us to test 8 bytes in parallel for a NUL byte), we should be able to show the benefit through a reduction in dynamic instructions. Would this be considered reasonable reference data? 
> > --- > > sysdeps/riscv/multiarch/Makefile | 3 +- > > sysdeps/riscv/multiarch/ifunc-impl-list.c | 1 + > > sysdeps/riscv/multiarch/strncmp.c | 6 +- > > sysdeps/riscv/multiarch/strncmp_zbb.S | 119 ++++++++++++++++++++++ > > 4 files changed, 127 insertions(+), 2 deletions(-) > > create mode 100644 sysdeps/riscv/multiarch/strncmp_zbb.S > > > > diff --git a/sysdeps/riscv/multiarch/Makefile b/sysdeps/riscv/multiarch/Makefile > > index 056ce2ffc0..9f22e31b99 100644 > > --- a/sysdeps/riscv/multiarch/Makefile > > +++ b/sysdeps/riscv/multiarch/Makefile > > @@ -14,5 +14,6 @@ sysdep_routines += \ > > strcmp_generic \ > > strcmp_zbb \ > > strcmp_zbb_unaligned \ > > - strncmp_generic > > + strncmp_generic \ > > + strncmp_zbb > > endif > > diff --git a/sysdeps/riscv/multiarch/ifunc-impl-list.c b/sysdeps/riscv/multiarch/ifunc-impl-list.c > > index eb37ed6017..82fd34d010 100644 > > --- a/sysdeps/riscv/multiarch/ifunc-impl-list.c > > +++ b/sysdeps/riscv/multiarch/ifunc-impl-list.c > > @@ -64,6 +64,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_generic)) > > > > IFUNC_IMPL (i, name, strncmp, > > + IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_zbb) > > IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_generic)) > > return i; > > } > > diff --git a/sysdeps/riscv/multiarch/strncmp.c b/sysdeps/riscv/multiarch/strncmp.c > > index 970aeb8b85..5b0fe08e98 100644 > > --- a/sysdeps/riscv/multiarch/strncmp.c > > +++ b/sysdeps/riscv/multiarch/strncmp.c > > @@ -30,8 +30,12 @@ > > > > extern __typeof (__redirect_strncmp) __libc_strncmp; > > extern __typeof (__redirect_strncmp) __strncmp_generic attribute_hidden; > > +extern __typeof (__redirect_strncmp) __strncmp_zbb attribute_hidden; > > > > -libc_ifunc (__libc_strncmp, __strncmp_generic); > > +libc_ifunc (__libc_strncmp, > > + HAVE_RV(zbb) > > + ? 
__strncmp_zbb > > + : __strncmp_generic); > > > > # undef strncmp > > strong_alias (__libc_strncmp, strncmp); > > diff --git a/sysdeps/riscv/multiarch/strncmp_zbb.S b/sysdeps/riscv/multiarch/strncmp_zbb.S > > new file mode 100644 > > index 0000000000..29cff30def > > --- /dev/null > > +++ b/sysdeps/riscv/multiarch/strncmp_zbb.S > > @@ -0,0 +1,119 @@ > > +/* Copyright (C) 2022 Free Software Foundation, Inc. > > + > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. > > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + Lesser General Public License for more details. > > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library. If not, see > > + . */ > > + > > +#include > > +#include > > + > > +/* Assumptions: rvi_zbb. 
*/ > > + > > +#define src1 a0 > > +#define result a0 > > +#define src2 a1 > > +#define len a2 > > +#define data1 a2 > > +#define data2 a3 > > +#define align a4 > > +#define data1_orcb t0 > > +#define limit t1 > > +#define fast_limit t2 > > +#define m1 t3 > > + > > +#if __riscv_xlen == 64 > > +# define REG_L ld > > +# define SZREG 8 > > +# define PTRLOG 3 > > +#else > > +# define REG_L lw > > +# define SZREG 4 > > +# define PTRLOG 2 > > +#endif > > + > > +#ifndef STRNCMP > > +# define STRNCMP __strncmp_zbb > > +#endif > > + > > +.option push > > +.option arch,+zbb > > + > > +ENTRY_ALIGN (STRNCMP, 6) > > + beqz len, L(equal) > > + or align, src1, src2 > > + and align, align, SZREG-1 > > + add limit, src1, len > > + bnez align, L(simpleloop) > > + li m1, -1 > > + > > + /* Adjust limit for fast-path. */ > > + andi fast_limit, limit, -SZREG > > + > > + /* Main loop for aligned string. */ > > + .p2align 3 > > +L(loop): > > + bge src1, fast_limit, L(simpleloop) > > + REG_L data1, 0(src1) > > + REG_L data2, 0(src2) > > + orc.b data1_orcb, data1 > > + bne data1_orcb, m1, L(foundnull) > > + addi src1, src1, SZREG > > + addi src2, src2, SZREG > > + beq data1, data2, L(loop) > > + > > + /* Words don't match, and no null byte in the first > > + * word. Get bytes in big-endian order and compare. */ > > +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ > > + rev8 data1, data1 > > + rev8 data2, data2 > > +#endif > > + /* Synthesize (data1 >= data2) ? 1 : -1 in a branchless sequence. */ > > + sltu result, data1, data2 > > + neg result, result > > + ori result, result, 1 > > + ret > > + > > +L(foundnull): > > + /* Found a null byte. > > + * If words don't match, fall back to simple loop. */ > > + bne data1, data2, L(simpleloop) > > + > > + /* Otherwise, strings are equal. */ > > + li result, 0 > > + ret > > + > > + /* Simple loop for misaligned strings. 
*/ > > + .p2align 3 > > +L(simpleloop): > > + bge src1, limit, L(equal) > > + lbu data1, 0(src1) > > + addi src1, src1, 1 > > + lbu data2, 0(src2) > > + addi src2, src2, 1 > > + bne data1, data2, L(sub) > > + bnez data1, L(simpleloop) > > + > > +L(sub): > > + sub result, data1, data2 > > + ret > > + > > +L(equal): > > + li result, 0 > > + ret > > + > > +.option pop > > + > > +END (STRNCMP) > > +libc_hidden_builtin_def (STRNCMP) > > -- > > 2.39.1 > >