From: Noah Goldstein
Date: Wed, 1 Feb 2023 12:11:01 -0600
Subject: Re: [PATCH 2/2] riscv: vectorised mem* and str* functions
To: Sergei Lewis
Cc: libc-alpha@sourceware.org
References: <20230201095232.15942-1-slewis@rivosinc.com> <20230201095232.15942-2-slewis@rivosinc.com>
In-Reply-To: <20230201095232.15942-2-slewis@rivosinc.com>

On Wed, Feb 1, 2023 at 3:54 AM Sergei Lewis wrote: > > Initial implementations of memchr, memcmp, memcpy, memmove, memset, strchr, > strcmp, strcpy, strlen, strncmp, strncpy, strnlen, strrchr, strspn > targeting the riscv "V" extension, version 1.0 > > The vectorised implementations assume VLENB of at least 128 and at least 32 > registers (as mandated by the "V" extension spec). 
They also assume that > VLENB is a power of two which is no larger than the page size, and (as > vectorised code in glibc for other platforms does) that it is safe to read > past null terminators / buffer ends provided one does not cross a page > boundary. There should probably be a mention of performance gains vs the generic implementations in the commit message to justify this. > > Signed-off-by: Sergei Lewis > --- > sysdeps/riscv/rv64/rvv/Implies | 2 + > sysdeps/riscv/rv64/rvv/memchr.S | 127 +++++++++++++++++++ > sysdeps/riscv/rv64/rvv/memcmp.S | 93 ++++++++++++++ > sysdeps/riscv/rv64/rvv/memcpy.S | 154 +++++++++++++++++++++++ > sysdeps/riscv/rv64/rvv/memmove.c | 22 ++++ > sysdeps/riscv/rv64/rvv/memset.S | 89 ++++++++++++++ > sysdeps/riscv/rv64/rvv/strchr.S | 92 ++++++++++++++ > sysdeps/riscv/rv64/rvv/strchrnul.c | 22 ++++ > sysdeps/riscv/rv64/rvv/strcmp.S | 108 +++++++++++++++++ > sysdeps/riscv/rv64/rvv/strcpy.S | 72 +++++++++++ > sysdeps/riscv/rv64/rvv/strcspn.c | 22 ++++ > sysdeps/riscv/rv64/rvv/strlen.S | 67 ++++++++++ > sysdeps/riscv/rv64/rvv/strncmp.S | 104 ++++++++++++++++ > sysdeps/riscv/rv64/rvv/strncpy.S | 96 +++++++++++++++ > sysdeps/riscv/rv64/rvv/strnlen.S | 81 +++++++++++++ > sysdeps/riscv/rv64/rvv/strrchr.S | 88 ++++++++++++++ > sysdeps/riscv/rv64/rvv/strspn.S | 189 +++++++++++++++++++++++++++++ > 17 files changed, 1428 insertions(+) > create mode 100644 sysdeps/riscv/rv64/rvv/Implies > create mode 100644 sysdeps/riscv/rv64/rvv/memchr.S > create mode 100644 sysdeps/riscv/rv64/rvv/memcmp.S > create mode 100644 sysdeps/riscv/rv64/rvv/memcpy.S > create mode 100644 sysdeps/riscv/rv64/rvv/memmove.c > create mode 100644 sysdeps/riscv/rv64/rvv/memset.S > create mode 100644 sysdeps/riscv/rv64/rvv/strchr.S > create mode 100644 sysdeps/riscv/rv64/rvv/strchrnul.c > create mode 100644 sysdeps/riscv/rv64/rvv/strcmp.S > create mode 100644 sysdeps/riscv/rv64/rvv/strcpy.S > create mode 100644 sysdeps/riscv/rv64/rvv/strcspn.c > create mode 100644 sysdeps/riscv/rv64/rvv/strlen.S > create mode 100644 sysdeps/riscv/rv64/rvv/strncmp.S > create mode 100644 sysdeps/riscv/rv64/rvv/strncpy.S > create mode 100644 sysdeps/riscv/rv64/rvv/strnlen.S > create mode 100644 sysdeps/riscv/rv64/rvv/strrchr.S > create mode 100644 sysdeps/riscv/rv64/rvv/strspn.S > > diff --git a/sysdeps/riscv/rv64/rvv/Implies b/sysdeps/riscv/rv64/rvv/Implies > new file mode 100644 > index 0000000000..b07b4cb906 > --- /dev/null > +++ b/sysdeps/riscv/rv64/rvv/Implies > @@ -0,0 +1,2 @@ > +riscv/rv64/rvd > + > diff --git a/sysdeps/riscv/rv64/rvv/memchr.S b/sysdeps/riscv/rv64/rvv/memchr.S > new file mode 100644 > index 0000000000..a7e32b8f25 > --- /dev/null > +++ b/sysdeps/riscv/rv64/rvv/memchr.S > @@ -0,0 +1,127 @@ > + > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc. > + > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library. If not, see > + . 
*/ > + > + > +#include > + > + > +/* Optimised memchr for riscv with vector extension > + * Assumptions: > + * - cpu becomes bandwidth limited at or before > + * 2 vector register sized read/write operations > + * + 2 scalar operations > + * + conditional branch > + */ > + > +.globl memchr > +.type memchr,@function > + > +.align 2 > +memchr: > + beqz a2, .Lnot_found > + csrr t1, vlenb > + bgeu a2, t1, .Lvector_path /* only use vector path if we're scanning > + at least vlenb bytes */ > + > +#ifndef __riscv_strict_align > + li a3, 8 > + blt a2, a3, .Lbytewise > + > + li t1, 0x0101010101010101 > + slli a4, t1, 7 /* a4 = 0x8080808080808080 */ > + mul t2, a1, t1 /* entirety of t2 is now repeats of target character; > + assume mul is at worst no worse than 3*(shift+OR), > + otherwise do that instead */ > + > +/* > + * strategy: > + * t4 = ((*a0) ^ t2) > + * - now t4 contains zero bytes if and only if next word of memory > + * had target character at those positions > + * > + * t4 = ((t4-0x0101010101010101) & ~t4) & 0x8080808080808080 > + * - all nonzero bytes of t4 become 0; zero bytes become 0x80 > + * > + * if t4 is nonzero, find the index of the byte within it, add to a0 and return > + * otherwise, loop > + */ > + > +1: > + ld t4, (a0) /* t4 = load next 8 bytes */ > + xor t4, t4, t2 > + sub t5, t4, t1 > + not t4, t4 > + and t4, t5, t4 > + and t4, t4, a4 > + bnez t4, .Lbytewise /* could use ctzw, mod+lookup or just binary chop > + to locate byte of interest in t4 but profiling > + shows these approaches are at best no better */ > + addi a2, a2, -8 > + addi a0, a0, 8 > + bgeu a2, a3, 1b > + beqz a2, .Lnot_found > +#endif // __riscv_strict_align > + > +/* too little data for a dword. mask calculation and branch mispredict costs > + make checking a word not worthwhile. degrade to bytewise search. */ > + > +.Lbytewise: > + add t2, a0, a2 > + > +1: > + lb t1, (a0) > + beq t1, a1, .Lfound > + addi a0, a0, 1 > + blt a0, t2, 1b > + > +.Lnot_found: > + mv a0, zero > +.Lfound: > + ret > + > +.Lvector_path: > + vsetvli t2, a2, e8, m2, ta, ma > + > +1: > + vle8.v v2, (a0) > + vmseq.vx v0, v2, a1 > + vfirst.m t3, v0 > + bgez t3, .Lvec_found > + add a0, a0, t2 > + sub a2, a2, t2 > + bge a2, t2, 1b > + bnez a2, 2f > + mv a0, zero > + ret > + > +2: > + vsetvli t2, a2, e8, m2, ta, ma > + vle8.v v2, (a0) > + vmseq.vx v0, v2, a1 > + vfirst.m t3, v0 > + bgez t3, .Lvec_found > + mv a0, zero > + ret > + > +.Lvec_found: > + add a0, a0, t3 > + ret > + > +.size memchr, .-memchr > +libc_hidden_builtin_def (memchr) > \ No newline at end of file > diff --git a/sysdeps/riscv/rv64/rvv/memcmp.S b/sysdeps/riscv/rv64/rvv/memcmp.S > new file mode 100644 > index 0000000000..a945753a5f > --- /dev/null > +++ b/sysdeps/riscv/rv64/rvv/memcmp.S > @@ -0,0 +1,93 @@ > + > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc. > + > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library. 
If not, see > + . */ > + > + > +#include > + > +/* Optimised memcmp for riscv with vector extension > + */ > + > +.globl memcmp > +.type memcmp,@function > + > +.align 2 > + > +memcmp: > + mv t2, zero > + beqz a2, .Ldone > + > + li t1, 5 /* scalar path cheaper for 1-4 elts */ > + bltu a2, t1, .Lscalar > + > + /* main loop, vlenb*2 elts at a time */ > + vsetvli t1, a2, e8, m2, ta, ma > + > +1: > + vle8.v v2, (a0) /* load elts */ > + vle8.v v4, (a1) > + vmsne.vv v0, v2, v4 /* compare */ > + vfirst.m t3, v0 > + bgez t3, .Lvec_diff /* found a difference ? */ > + add a0, a0, t1 /* not yet, advance everything */ > + add a1, a1, t1 > + sub a2, a2, t1 > + bgeu a2, t1, 1b > + > + bnez a2, .Ltail > + mv a0, zero > + ret > + > +.Ltail: > + /* handle tail. we know a2 < vlenb*2 so just load and compare the lot */ > + vsetvli t1, a2, e8, m2, ta, ma > + vle8.v v2, (a0) > + vle8.v v4, (a1) > + vmsne.vv v0, v2, v4 > + vfirst.m t3, v0 > + bgez t3, .Lvec_diff > + mv a0, zero /* no diff found */ > + ret > + > +.Lvec_diff: /* v2, v4 differ at elt t3 */ > + add a0, a0, t3 > + add a1, a1, t3 > + lbu t0, (a0) > + lbu t1, (a1) > + sub a0, t0, t1 > + ret > + > +.Lscalar: > + add t3, a0, a2 > + > +1: > + lbu t0, (a0) > + lbu t1, (a1) > + sub t2, t0, t1 > + bnez t2, .Ldone > + addi a0, a0, 1 > + addi a1, a1, 1 > + bltu a0, t3, 1b > + > +.Ldone: > + mv a0, t2 > + ret > + > + > +.size memcmp, .-memcmp > +libc_hidden_builtin_def (memcmp) > \ No newline at end of file > diff --git a/sysdeps/riscv/rv64/rvv/memcpy.S b/sysdeps/riscv/rv64/rvv/memcpy.S > new file mode 100644 > index 0000000000..7b37ec285d > --- /dev/null > +++ b/sysdeps/riscv/rv64/rvv/memcpy.S > @@ -0,0 +1,154 @@ > + > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc. > + > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library. If not, see > + . */ > + > + > +#include > + > +/* Optimised memcpy and memmove for riscv with vector extension > + */ > + > +.globl memcpy > +.type memcpy,@function > +.globl memmove > +.type memmove,@function > + > +.align 2 > +memmove: > + bge a0, a1, .Lmemcpy_rev > + > +memcpy: > +.Lmemcpy_fwd: > + mv t0, a0 /* t0 = preserve a0 so we can return it */ > + csrr t2, vlenb /* t2 = number of bytes per vectorised copy op */ > + slli t5, t2, 1 /* t5 = number of bytes per loop */ > + addi t3, t5, -1 /* generate mask */ > + not t4, t3 > + and t4, a2, t4 /* t4 = bytes copied in vectorised pass */ > + > + beqz t4, .Lscalar_fwd /* size too small for even one pass? */ > + > + and a2, a2, t3 /* a2 = bytes still left to copy after pass */ > + add t4, t4, a1 /* t4 = src at end of vectorised pass */ > + > +1: > + vl2r.v v2, (a1) /* load, advance source */ > + add a1, a1, t5 > + vs2r.v v2, (t0) /* store, advance dest */ > + add t0, t0, t5 > + bltu a1, t4, 1b /* src at end? */ > + > + bltu a2, t2, .Lscalar_fwd /* should we do one more vec load/store? 
*/ > + vl1r.v v2, (a1) > + sub a2, a2, t2 > + add a1, a1, t2 > + vs1r.v v2, (t0) > + add t0, t0, t2 > + > +.Lscalar_fwd: > + bnez a2, .Lnobail > +.Lbail: > + ret > +.Lnobail: > + > +#ifndef __riscv_strict_align > + addi t2, zero, 4 > + bltu a2, t2, .Lsingle_bytes > +1: > + lw t3, 0(a1) > + addi a1, a1, 4 > + sw t3, 0(t0) > + addi t0, t0, 4 > + addi a2, a2, -4 > + bgeu a2, t2, 1b > +#endif // __riscv_strict_align > + > +.Lsingle_bytes: > + beqz a2, .Lbail > + add a2, a2, a1 /* a2 = src + remaining size */ > +1: > + lb t1, 0(a1) > + sb t1, 0(t0) > + addi a1, a1, 1 > + addi t0, t0, 1 > + bltu a1, a2, 1b > + ret > +.size memcpy, .-memcpy > + > + > +.Lmemcpy_rev: > + beq a0, a1, .Lmemcpy_rev_done > + add t0, a0, a2 /* t0 = dest so we can return a0=dest later */ > + add t6, a1, a2 /* dest and src both point to byte */ > + /* immediately after end of buffer */ > + > + csrr t2, vlenb /* t2 = number of bytes per pass */ > + slli t5, t2, 1 /* t5 = number of bytes per entire loop */ > + addi t3, t5, -1 /* t3 = (bytes per loop) mask */ > + not t4, t3 /* generate mask for bytes processed by loop */ > + and t4, a2, t4 /* t4 = bytes copied in vectorised pass */ > + > + beqz t4, .Lscalar_rev /* size too small for even one pass? */ > + > + and a2, a2, t3 /* a2 = bytes still left to copy after pass */ > + sub t4, t6, t4 /* t4 = src at end of vectorised pass */ > + > +1: > + sub t6, t6, t5 > + sub t0, t0, t5 > + vl2r.v v2, (t6) /* load, advance source */ > + vs2r.v v2, (t0) /* store, advance dest */ > + bgtu t6, t4, 1b /* src at end? */ > + > + bltu a2, t2, .Lscalar_rev /* should we do one more vec load/store? */ > + sub t6, t6, t2 > + sub t0, t0, t2 > + sub a2, a2, t2 > + vl1r.v v2, (t6) > + vs1r.v v2, (t0) > + > +.Lscalar_rev: > +#ifndef __riscv_strict_align > + beqz a2, .Lbail > + > + addi t2, zero, 4 > + bltu a2, t2, 2f > +1: > + addi t6, t6, -4 > + addi t0, t0, -4 > + addi a2, a2, -4 > + lw t3, 0(t6) > + sw t3, 0(t0) > + bgeu a2, t2, 1b > +2: > +#endif // __riscv_strict_align > + > + beqz a2, .Lbail > +1: > + addi t6, t6, -1 > + addi t0, t0, -1 > + lb t1, 0(t6) > + sb t1, 0(t0) > + bgtu t0, a0, 1b > + > +.Lmemcpy_rev_done: > + ret > + > +.size memmove, .-memmove > +libc_hidden_builtin_def (memcpy) > +libc_hidden_builtin_def (memmove) > \ No newline at end of file > diff --git a/sysdeps/riscv/rv64/rvv/memmove.c b/sysdeps/riscv/rv64/rvv/memmove.c > new file mode 100644 > index 0000000000..47734854f9 > --- /dev/null > +++ b/sysdeps/riscv/rv64/rvv/memmove.c > @@ -0,0 +1,22 @@ > + > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc. > + > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library. If not, see > + . 
*/ > + > + > +/* memmove is implemented in memcpy.S > + */ > diff --git a/sysdeps/riscv/rv64/rvv/memset.S b/sysdeps/riscv/rv64/rvv/memset.S > new file mode 100644 > index 0000000000..6f82c542b1 > --- /dev/null > +++ b/sysdeps/riscv/rv64/rvv/memset.S > @@ -0,0 +1,89 @@ > + > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc. > + > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library. If not, see > + . */ > + > + > +#include > + > + > +/* Optimised memset for riscv with vector extension > + */ > + > +.globl memset > +.type memset,@function > + > +.align 2 > +memset: > + mv t0, a0 /* t0 = dest so we can return a0 later */ > + vsetvli t2, a2, e8, m2, ta, ma /* t2 = elts per copy */ > + beqz t2, .Lscalar > + > + vmv.v.x v2, a1 /* splat value across v2 */ > + > + slli t3, t2, 1 > + bgtu t3, a2, .Lsinglestore > + > +1: > + vse8.v v2, (t0) > + add t0, t0, t2 > + vse8.v v2, (t0) > + add t0, t0, t2 > + sub a2, a2, t3 > + bgeu a2, t3, 1b > + bgeu a2, t2, .Lsinglestore > + bnez a2, .Lscalar > + > +.Lbail: > + ret > + > +.Lsinglestore: > + bgtu t2, a2, .Lscalar > + vse8.v v2, (t0) > + add t0, t0, t2 > + sub a2, a2, t2 > + > +.Lscalar: > + beqz a2, .Lbail > + > +#ifndef __riscv_strict_align > + slli t2, a1, 8 > + or a1, a1, t2 > + slli t2, a1, 16 > + or a1, a1, t2 > + > + addi t2, zero, 4 > + bltu a2, t2, 2f > + > +1: > + sw a1, 0(t0) > + addi t0, t0, 4 > + addi a2, a2, -4 > + bgeu a2, t2, 1b > +2: > + beqz a2, .Lbail > +#endif // __riscv_strict_align > + > + add a2, a2, t0 > +1: > + sb a1, 0(t0) > + addi t0, t0, 1 > + bltu t0, a2, 1b > + ret > + > +.size memset, .-memset > +libc_hidden_builtin_def (memset) > \ No newline at end of file > diff --git a/sysdeps/riscv/rv64/rvv/strchr.S b/sysdeps/riscv/rv64/rvv/strchr.S > new file mode 100644 > index 0000000000..0b37174c55 > --- /dev/null > +++ b/sysdeps/riscv/rv64/rvv/strchr.S > @@ -0,0 +1,92 @@ > + > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc. > + > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library. If not, see > + . 
*/ > + > + > +#include > + > +.globl strchr > +.type strchr,@function > + > +.globl __strchrnul > +.type __strchrnul,@function > + > +/* > + * optimized strchr for riscv with vector extension > + * assumptions: > + * - vlenb is a power of 2 > + * - page size >= 2*vlenb > + */ > + > +.align 2 > +__strchrnul: > + li t5, -1 > + j 1f > + > +strchr: > + mv t5, zero > +1: csrr t1, vlenb /* find vlenb*2 */ > + add t1, t1, t1 > + addi t2, t1, -1 /* mask off unaligned part of pointer */ > + and t2, a0, t2 > + beqz t2, .Laligned > + > + sub t2, t1, t2 /* search however many bytes > + are needed to align the pointer */ > + vsetvli t2, t2, e8, m2, ta, mu > + > + vle8.v v2, (a0) /* load data into v2(,v3) */ > + vmseq.vx v4, v2, zero > + vfirst.m t4, v4 > + vmsbf.m v0, v4 > + vmseq.vx v0, v2, a1, v0.t > + vfirst.m t3, v0 > + bgez t3, .Lfound > + bgez t4, .Lbufferend > + add a0, a0, t2 > + > +.Laligned: > + vsetvli zero, t1, e8, m2, ta, mu > + li t4, -1 > + > +1: > + vle8.v v2, (a0) > + vmseq.vx v4, v2, zero > + vfirst.m t4, v4 > + vmsbf.m v0, v4 > + vmseq.vx v0, v2, a1, v0.t > + vfirst.m t3, v0 > + bgez t3, .Lfound > + bgez t4, .Lbufferend > + add a0, a0, t1 > + j 1b > + > +.Lfound: /* found the target at a0+t3 */ > + add a0, a0, t3 > + ret > + > +.Lbufferend: > + add a0, a0, t4 > + and a0, a0, t5 > + ret > + > +.size strchr, .-strchr > +.size __strchrnul, .-__strchrnul > + > +libc_hidden_builtin_def (strchr) > +weak_alias (__strchrnul, strchrnul) > \ No newline at end of file > diff --git a/sysdeps/riscv/rv64/rvv/strchrnul.c b/sysdeps/riscv/rv64/rvv/strchrnul.c > new file mode 100644 > index 0000000000..259da80358 > --- /dev/null > +++ b/sysdeps/riscv/rv64/rvv/strchrnul.c > @@ -0,0 +1,22 @@ > + > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc. > + > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library. If not, see > + . */ > + > + > +/* strchrnul is implemented in strchr.S > + */ > diff --git a/sysdeps/riscv/rv64/rvv/strcmp.S b/sysdeps/riscv/rv64/rvv/strcmp.S > new file mode 100644 > index 0000000000..4a219221ac > --- /dev/null > +++ b/sysdeps/riscv/rv64/rvv/strcmp.S > @@ -0,0 +1,108 @@ > + > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc. > + > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library. If not, see > + . 
*/ > + > + > +#include > + > +.globl strcmp > +.type strcmp,@function > + > +.align 2 > + > +/* most of the time, one or both sides is unaligned and their alignments differ > + * we need to check for a null terminator before crossing a page boundary > + * strategy: > + * - for each side, calculate masks for alignment and (vlenb * 2) - alignment > + * - while no difference encountered: > + * - for each side: > + * - load bytes to end of next vlenb*2 block > + * - check for null terminator > + * - if no terminator, load bytes to fill rest of register > + * - compare sides > + */ > + > +strcmp: > + csrr t1, vlenb /* find vlenb*2 */ > + add t1, t1, t1 > + vsetvli zero, t1, e8, m2, ta, mu > + vid.v v30 > + addi t2, t1, -1 /* mask for unaligned part of ptr */ > + and t6, a0, t2 /* unaligned part of lhs */ > + and t5, a1, t2 /* unaligned part of rhs */ > + sub t6, t1, t6 /* safe number of lhs bytes to read */ > + sub t5, t1, t5 /* same, rhs */ > + vmsltu.vx v28, v30, t6 /* v28 = mask for first half of lhs load */ > + vmsltu.vx v26, v30, t5 /* v26 = mask for first half of rhs load */ > + vmv.v.x v16, zero > + vmv.v.x v18, zero > + > +1: vmv.v.v v0, v28 /* lhs mask */ > + vle8.v v2, (a0), v0.t /* masked load from lhs */ > + vmseq.vx v16, v2, zero, v0.t /* check loaded bytes for null */ > + vmv.v.v v0, v26 /* rhs mask */ > + vfirst.m t2, v16 /* get lhs check result */ > + bgez t2, .Ltail /* bail if we can't safely check rest */ > + vle8.v v4, (a1), v0.t /* masked load from rhs */ > + vmseq.vx v18, v4, zero, v0.t /* check partial rhs for null */ > + vmnot.m v0, v28 /* mask for rest of lhs */ > + vfirst.m t3, v18 /* get check result */ > + bltz t3, 2f /* test it */ > + /* we see null terminator */ > + bge t3, t6, .Ltail /* have enough bytes for vector cmp? */ > + > + vmsleu.vx v0, v30, t3 /* select rest + null */ > + vmsne.vv v0, v2, v4, v0.t /* compare */ > + vfirst.m t3, v0 > + bgez t3, 3f > + mv a0, zero /* no difference */ > + ret > +3: add a0, a0, t3 > + add a1, a1, t3 > + lbu t0, (a0) > + lbu t1, (a1) > +.Ldiff: > + sub a0, t0, t1 > + ret > + > + /* ...no null terminator */ > +2: vle8.v v2, (a0), v0.t /* load rest of lhs */ > + vmnot.m v0, v26 /* mask for rest of rhs */ > + vle8.v v4, (a1), v0.t /* load rest of rhs */ > + vmsne.vv v0, v2, v4 /* compare */ > + add a0, a0, t1 /* advance ptrs */ > + vfirst.m t3, v0 > + add a1, a1, t1 > + bltz t3, 1b > + > + sub t3, t3, t1 /* found difference but we've already advanced a0 and a1 */ > + j 3b > + > +.Ltail: > + lbu t0, (a0) > + lbu t1, (a1) > + bne t0, t1, .Ldiff > + addi a0, a0, 1 > + addi a1, a1, 1 > + bnez t0, .Ltail > + mv a0, zero > + ret > + > + > +.size strcmp, .-strcmp > +libc_hidden_builtin_def (strcmp) > \ No newline at end of file > diff --git a/sysdeps/riscv/rv64/rvv/strcpy.S b/sysdeps/riscv/rv64/rvv/strcpy.S > new file mode 100644 > index 0000000000..b21909d66f > --- /dev/null > +++ b/sysdeps/riscv/rv64/rvv/strcpy.S > @@ -0,0 +1,72 @@ > + > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc. > + > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library. If not, see > + . */ > + > + > +#include > + > +.globl strcpy > +.type strcpy,@function > + > +/* > + * optimized strcpy for riscv with vector extension > + * assumptions: > + * - vlenb is a power of 2 > + * - page size >= 2*vlenb > + */ > + > +.align 2 > +strcpy: > + mv t0, a0 /* copy dest so we can return it */ > + > + csrr t1, vlenb /* find vlenb*2 */ > + add t1, t1, t1 > + > + addi t2, t1, -1 /* mask unaligned part of ptr */ > + and t2, a1, t2 > + beqz t2, .Laligned > + > + sub t2, t1, t2 /* search enough to align ptr */ > + vsetvli t2, t2, e8, m2, tu, mu > + vle8.v v2, (a1) > + vmseq.vx v4, v2, zero > + vmsif.m v0, v4 /* copy but not past null */ > + vfirst.m t3, v4 > + vse8.v v2, (t0), v0.t > + bgez t3, .Ldone > + add t0, t0, t2 > + add a1, a1, t2 > + > +.Laligned: > + vsetvli zero, t1, e8, m2, ta, mu /* now do 2*vlenb bytes per pass */ > + > +1: > + vle8.v v2, (a1) > + add a1, a1, t1 > + vmseq.vx v4, v2, zero > + vmsif.m v0, v4 > + vfirst.m t3, v4 > + vse8.v v2, (t0), v0.t > + add t0, t0, t1 > + bltz t3, 1b > + > +.Ldone: > + ret > + > +.size strcpy, .-strcpy > +libc_hidden_builtin_def (strcpy) > \ No newline at end of file > diff --git a/sysdeps/riscv/rv64/rvv/strcspn.c b/sysdeps/riscv/rv64/rvv/strcspn.c > new file mode 100644 > index 0000000000..f0595a72fb > --- /dev/null > +++ b/sysdeps/riscv/rv64/rvv/strcspn.c > @@ -0,0 +1,22 @@ > + > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc. > + > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library. If not, see > + . */ > + > + > +/* strcspn is implemented in strspn.S > + */ > diff --git a/sysdeps/riscv/rv64/rvv/strlen.S b/sysdeps/riscv/rv64/rvv/strlen.S > new file mode 100644 > index 0000000000..c77d500693 > --- /dev/null > +++ b/sysdeps/riscv/rv64/rvv/strlen.S > @@ -0,0 +1,67 @@ > + > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc. > + > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library. If not, see > + . 
*/ > + > + > +#include > + > +.globl strlen > +.type strlen,@function > + > +/* > + * optimized strlen for riscv with vector extension > + * assumptions: > + * - vlenb is a power of 2 > + * - page size >= 2*vlenb > + */ > + > +.align 2 > +strlen: > + mv t4, a0 /* copy of buffer start */ > + csrr t1, vlenb /* find vlenb*2 */ > + add t1, t1, t1 > + addi t2, t1, -1 /* mask off unaligned part of ptr */ > + and t2, a0, t2 > + beqz t2, .Laligned > + > + sub t2, t1, t2 /* search fwd to align ptr */ > + vsetvli t2, t2, e8, m2, ta, ma > + vle8.v v2, (a0) > + vmseq.vx v0, v2, zero > + vfirst.m t3, v0 > + bgez t3, .Lfound > + add a0, a0, t2 > + > +.Laligned: > + vsetvli zero, t1, e8, m2, ta, ma /* search 2*vlenb bytes per pass */ > + add t4, t4, t1 > + > +1: > + vle8.v v2, (a0) > + add a0, a0, t1 > + vmseq.vx v0, v2, zero > + vfirst.m t3, v0 > + bltz t3, 1b > + > +.Lfound: /* found the 0; subtract */ > + sub a0, a0, t4 /* buffer start from current ptr */ > + add a0, a0, t3 /* and add offset into fetched */ > + ret /* data to get length */ > + > +.size strlen, .-strlen > +libc_hidden_builtin_def (strlen) > \ No newline at end of file > diff --git a/sysdeps/riscv/rv64/rvv/strncmp.S b/sysdeps/riscv/rv64/rvv/strncmp.S > new file mode 100644 > index 0000000000..863e5cb525 > --- /dev/null > +++ b/sysdeps/riscv/rv64/rvv/strncmp.S > @@ -0,0 +1,104 @@ > + > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc. > + > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library. If not, see > + . 
*/ > + > + > +#include > + > +.globl strncmp > +.type strncmp,@function > + > +.align 2 > + > +/* as strcmp, but with added checks on a2 (max count) > + */ > + > +strncmp: > + csrr t1, vlenb /* find vlenb*2 */ > + add t1, t1, t1 > + blt a2, t1, .Ltail /* degrade if max < vlenb*2 */ > + vsetvli zero, t1, e8, m2, ta, mu > + vid.v v30 > + addi t2, t1, -1 /* mask unaligned part of ptr */ > + and t6, a0, t2 /* unaligned part of lhs */ > + and t5, a1, t2 /* unaligned part of rhs */ > + sub t6, t1, t6 /* safe count to read from lhs */ > + sub t5, t1, t5 /* same, rhs */ > + vmsltu.vx v28, v30, t6 /* mask for first part of lhs */ > + vmsltu.vx v26, v30, t5 /* mask for first part of rhs */ > + vmv.v.x v16, zero > + vmv.v.x v18, zero > + > + > +1: blt a2, t1, .Ltail > + vmv.v.v v0, v28 /* lhs mask */ > + vle8.v v2, (a0), v0.t /* masked load from lhs */ > + vmseq.vx v16, v2, zero, v0.t /* check loaded bytes for null */ > + vmv.v.v v0, v26 /* rhs mask */ > + vfirst.m t2, v16 /* get lhs check result */ > + bgez t2, .Ltail /* can we safely check rest */ > + vle8.v v4, (a1), v0.t /* masked load from rhs */ > + vmseq.vx v18, v4, zero, v0.t /* check partial rhs */ > + vmnot.m v0, v28 /* mask for rest of lhs */ > + vfirst.m t3, v18 /* get check result */ > + bltz t3, 2f /* test it */ > + bge t3, t6, .Ltail > + > + vmsleu.vx v0, v30, t3 /* select rest of string + null */ > + vmsne.vv v0, v2, v4, v0.t /* compare */ > + vfirst.m t3, v0 > + bgez t3, 3f > + mv a0, zero > + ret > +3: add a0, a0, t3 > + add a1, a1, t3 > + lbu t0, (a0) > + lbu t1, (a1) > +.Ldiff: > + sub a0, t0, t1 > + ret > + > + /* ...no null terminator in first part of lhs or rhs */ > +2: vle8.v v2, (a0), v0.t /* load rest of lhs */ > + vmnot.m v0, v26 /* mask for rest of rhs */ > + vle8.v v4, (a1), v0.t /* load rest of rhs */ > + vmsne.vv v0, v2, v4 /* compare */ > + add a0, a0, t1 /* advance ptrs */ > + vfirst.m t3, v0 > + add a1, a1, t1 > + sub a2, a2, t1 > + bltz t3, 1b > + > + sub t3, t3, t1 /* found a diff but we've already advanced a0 and a1 */ > + j 3b > + > +.Ltail: > + beqz a2, 1f > + addi a2, a2, -1 > + lbu t0, (a0) > + lbu t1, (a1) > + bne t0, t1, .Ldiff > + addi a0, a0, 1 > + addi a1, a1, 1 > + bnez t0, .Ltail > +1: mv a0, zero > + ret > + > + > +.size strncmp, .-strncmp > +libc_hidden_builtin_def (strncmp) > \ No newline at end of file > diff --git a/sysdeps/riscv/rv64/rvv/strncpy.S b/sysdeps/riscv/rv64/rvv/strncpy.S > new file mode 100644 > index 0000000000..8b3a1e545c > --- /dev/null > +++ b/sysdeps/riscv/rv64/rvv/strncpy.S > @@ -0,0 +1,96 @@ > + > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc. > + > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library. If not, see > + . 
*/ > + > + > +#include > + > +.globl strncpy > +.type strncpy,@function > + > +/* > + * optimized strcpy for riscv with vector extension > + * assumptions: > + * - vlenb is a power of 2 > + * - page size >= 2*vlenb > + */ > + > +.align 2 > +strncpy: > + mv t0, a0 /* need to return dest so copy */ > + > + csrr t1, vlenb /* find vlenb*2 */ > + add t1, t1, t1 > + > + addi t2, t1, -1 /* mask off unaligned part of ptr */ > + and t2, a1, t2 > + beqz t2, .Laligned > + > + sub t2, t1, t2 /* search to align the pointer */ > + vsetvli zero, t2, e8, m2, tu, mu > + vle8.v v2, (a1) > + vmseq.vx v4, v2, zero > + vmsif.m v0, v4 /* copy to dest */ > + vfirst.m t3, v4 > + bgeu t2, a2, .Ldest_full > + vse8.v v2, (t0), v0.t > + bgez t3, .Lterminator_found > + add t0, t0, t2 > + add a1, a1, t2 > + sub a2, a2, t2 > + beqz a2, .Ldone > + > +.Laligned: > + vsetvli zero, t1, e8, m2, ta, mu /* now do 2*vlenb bytes per pass */ > + > +1: > + vle8.v v2, (a1) > + add a1, a1, t1 > + vmseq.vx v4, v2, zero > + vmsif.m v0, v4 > + vfirst.m t3, v4 > + bgeu t1, a2, .Ldest_full > + vse8.v v2, (t0), v0.t > + add t0, t0, t1 > + sub a2, a2, t1 > + bltz t3, 1b > + sub t0, t0, t1 > + > +.Lterminator_found: > + addi sp, sp, -16 > + sd ra, 0(sp) > + sd a0, 8(sp) > + add a0, t0, t3 > + mv a1, zero > + sub a2, a2, t3 > + jal ra, memset > + ld ra, 0(sp) > + ld a0, 8(sp) > + addi sp, sp, 16 > +.Ldone: > + ret > + > +.Ldest_full: > + vid.v v6 > + vmsltu.vx v4, v6, a2 > + vmand.mm v0, v0, v4 > + vse8.v v2, (t0), v0.t > + ret > + > +.size strncpy, .-strncpy > +libc_hidden_builtin_def (strncpy) > \ No newline at end of file > diff --git a/sysdeps/riscv/rv64/rvv/strnlen.S b/sysdeps/riscv/rv64/rvv/strnlen.S > new file mode 100644 > index 0000000000..6d7ee65c7a > --- /dev/null > +++ b/sysdeps/riscv/rv64/rvv/strnlen.S > @@ -0,0 +1,81 @@ > + > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc. > + > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library. If not, see > + . */ > + > + > +#include > + > +.globl __strnlen > +.type __strnlen,@function > + > +/* vector optimized strnlen > + * assume it's safe to read to the end of the page > + * containing either a null terminator or the last byte of the count or both, > + * but not past it > + * assume page size >= vlenb*2 > + */ > + > +.align 2 > +__strnlen: > + mv t4, a0 /* stash a copy of start for later */ > + beqz a1, .LzeroCount > + > + csrr t1, vlenb /* find vlenb*2 */ > + add t1, t1, t1 > + addi t2, t1, -1 /* mask off unaligned part of ptr */ > + and t2, a1, a0 > + beqz t2, .Laligned > + > + sub t2, t1, t2 /* search to align pointer to t1 */ > + bgeu t2, a1, 2f /* check it's safe */ > + mv t2, a1 /* it's not! 
look as far as permitted */ > +2: vsetvli t2, t2, e8, m2, ta, ma > + vle8.v v2, (a0) > + vmseq.vx v0, v2, zero > + vfirst.m t3, v0 > + bgez t3, .Lfound > + add a0, a0, t2 > + sub a1, a1, t2 > + bltu a1, t1, .LreachedCount > + > +.Laligned: > + vsetvli zero, t1, e8, m2, ta, ma /* do 2*vlenb bytes per pass */ > + > +1: vle8.v v2, (a0) > + sub a1, a1, t1 > + vmseq.vx v0, v2, zero > + vfirst.m t3, v0 > + bgez t3, .Lfound > + add a0, a0, t1 > + bgeu a1, t1, 1b > +.LreachedCount: > + mv t2, a1 /* in case 0 < a1 < t1 */ > + bnez a1, 2b /* if so, still t2 bytes to check, all safe */ > +.LzeroCount: > + sub a0, a0, t4 > + ret > + > +.Lfound: /* found the 0; subtract buffer start from current pointer */ > + add a0, a0, t3 /* and add offset into fetched data */ > + sub a0, a0, t4 > + ret > + > +.size __strnlen, .-__strnlen > +weak_alias (__strnlen, strnlen) > +libc_hidden_builtin_def (__strnlen) > +libc_hidden_builtin_def (strnlen) > \ No newline at end of file > diff --git a/sysdeps/riscv/rv64/rvv/strrchr.S b/sysdeps/riscv/rv64/rvv/strrchr.S > new file mode 100644 > index 0000000000..4bef8a3b9c > --- /dev/null > +++ b/sysdeps/riscv/rv64/rvv/strrchr.S > @@ -0,0 +1,88 @@ > + > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc. > + > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library. If not, see > + . */ > + > + > +#include > + > +.globl strrchr > +.type strrchr,@function > + > +/* > + * optimized strrchr for riscv with vector extension > + * assumptions: > + * - vlenb is a power of 2 > + * - page size >= 2*vlenb > + */ > + > +.align 2 > +strrchr: > + mv t5, a0 /* stash buffer ptr somewhere safe */ > + mv a0, zero /* result is nullptr unless we find better below */ > + > + csrr t1, vlenb /* determine vlenb*2 */ > + add t1, t1, t1 > + addi t2, t1, -1 /* mask off unaligned part of ptr */ > + and t2, t5, t2 > + beqz t2, .Laligned > + > + sub t2, t1, t2 /* search to align ptr to 2*vlenb */ > + vsetvli t2, t2, e8, m2, ta, mu > + > + vle8.v v2, (t5) /* load data into v2(,v3) */ > + vmseq.vx v4, v2, zero /* check for null terminator */ > + vfirst.m t4, v4 /* grab its position, if any */ > + vmsbf.m v0, v4 /* select valid chars */ > + vmseq.vx v0, v2, a1, v0.t /* search for candidate byte */ > + vfirst.m t3, v0 /* grab its position, if any */ > + bltz t3, 2f /* did we find a candidate? */ > + > +3: add a0, t3, t5 /* we did! grab the address */ > + vmsof.m v1, v0 /* there might be more than one */ > + vmandn.mm v0, v0, v1 /* so clear the one we just found */ > + vfirst.m t3, v0 /* is there another? */ > + bgez t3, 3b > + > +2: bgez t4, .Ldone /* did we see a null terminator? 
*/ > + add t5, t5, t2 > + > +.Laligned: > + vsetvli zero, t1, e8, m2, ta, mu /* now do 2*vlenb bytes per pass */ > + > +1: vle8.v v2, (t5) > + vmseq.vx v4, v2, zero > + vfirst.m t4, v4 > + vmsbf.m v0, v4 > + vmseq.vx v0, v2, a1, v0.t > + vfirst.m t3, v0 > + bltz t3, 2f > + > +3: add a0, t3, t5 > + vmsof.m v1, v0 > + vmandn.mm v0, v0, v1 > + vfirst.m t3, v0 > + bgez t3, 3b > + > +2: add t5, t5, t1 > + bltz t4, 1b > + > +.Ldone: > + ret > + > +.size strrchr, .-strrchr > +libc_hidden_builtin_def (strrchr) > \ No newline at end of file > diff --git a/sysdeps/riscv/rv64/rvv/strspn.S b/sysdeps/riscv/rv64/rvv/strspn.S > new file mode 100644 > index 0000000000..2b9af5cc2d > --- /dev/null > +++ b/sysdeps/riscv/rv64/rvv/strspn.S > @@ -0,0 +1,189 @@ > + > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc. > + > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library. If not, see > + . */ > + > + > +#include > + > +.globl strspn > +.type strspn,@function > + > +.globl strcspn > +.type strcspn,@function > + > +/* > + * optimized strspn / strcspn for riscv with vector extension > + * assumptions: > + * - vlenb is a power of 2 > + * - page size >= 32 > + * strategy: > + * - build a 256-bit table on the stack, where each elt is zero > + * if encountering it should terminate computation and nonzero otherwise > + * - use vectorised lookups into this to check 2*vlen elts at a time; > + * this code is identical for strspan and strcspan and can be shared > + * > + * note that while V mandates at least 128 bit wide regs, > + * we are building a 256 bit lookup table > + * therefore we use either LMUL=1 or 2 depending on what the target supports > + * therefore we only use even vector register numbers, > + * so everything still works if we go with LMUL=2 > + */ > + > +# ----------------------------- > + > +.align 2 > + > +strspn: > + lbu t0, 0(a1) > + bnez t0, .Lbuild_table > + mv a0, zero > + ret > + > +.Lbuild_table: > + mv a6, a0 /* store incoming a0 */ > + li t1, 32 /* want to deal with 256 bits at a time, so 32 bytes */ > + > + vsetvli zero, t1, e8, m1, tu, mu > +#if __riscv_v_min_vlen < 256 > + /* we want to build a 256-bit table, so use vlenb*2, > + * m2 if regs are 128 bits wide or vlenb, m1 if >= 256 > + * 'V' extension specifies a minimum vlen of 128 so this should cover > + * all cases; we can skip the check if we know vlen >= 256 at compile time > + */ > + csrr t2, vlenb > + bgeu t2, t1, 1f > + vsetvli zero, t1, e8, m2, tu, mu > +1: > +#endif // __riscv_v_min_vlen > + > + /* read one char from the charset at a time and write the correct bit > + * in the lookup table; we could do SIMD iff we ever get an extension > + * that provides some way of scattering bytes into a reg group > + */ > + vmv.v.x v16, zero /* clear out table */ > + vmv.v.x v8, zero /* clear out v8 */ > + li t3, 1 > + vmv.s.x v8, t3 /* v8 now all zeroes except bottom byte */ > + > +1: vmv.v.x v2, zero 
/* clear out v2 */ > + addi a1, a1, 1 /* advance charset ptr */ > + srli t2, t0, 3 /* divide the byte we read earlier by 8 */ > + vslideup.vx v2, v8, t2 /* v2 now 1 in the correct byte 0 elsewhere */ > + vsll.vx v2, v2, t0 /* v2 now 1 in the correct bit, 0 elsewhere */ > + vor.vv v16, v16, v2 /* or it in */ > + lbu t0, 0(a1) /* fetch next bute */ > + bnez t0, 1b /* if it's null, go round again */ > + > +/* > + * Table is now built in v16. > + * Strategy: > + * - fetch next t1 bytes from memory > + * - vrgather on their values divided by 8 to get relevant bytes of table > + * - shift right to get the correct bit into bit 1 > + * - and with 1, compare with expected terminator value, then check mask > + * to see if we've found a terminator > + * > + * Before we can begin, a0 needs to be t1-aligned, so that when we fetch > + * the next t1 bytes - any of which may be the null terminator - > + * we do not cross a page boundary and read unmapped memory. Therefore > + * we have one read of however many bytes are needed to align a0, > + * before the main loop. > + */ > + > +.Lscan_table: > + vmv.v.x v8, t3 /* v8 now t1 bytes of 0x01 */ > + > + and t2, a0, t1 /* mask to align to t1 */ > + beqz t2, 2f /* or skip if we're already aligned */ > + sub t2, t1, t2 /* t2 now bytes to read to align to t1 */ > + > + vid.v v2 /* build mask instead of changing vl */ > + vmsltu.vx v0, v2, t2 /* so we don't need to track LMUL */ > + > + vle8.v v2, (a0), v0.t /* load next bytes from input */ > + vsrl.vi v4, v2, 3 /* divide by 8 */ > + vrgather.vv v6, v16, v4 /* corresponding bytes of bit table */ > + vsrl.vv v6, v6, v2 /* shift correct bits to lsb */ > + vand.vv v6, v6, v8 /* and with 1 to complete the lookups */ > + vmseq.vx v4, v6, zero, v0.t /* check to see if any 0s are present */ > + vfirst.m t0, v4 /* index of the first 0, if any */ > + bgez t0, .Lscan_end /* if we found one, stop */ > + add a0, a0, t2 /* advance by number of bytes we read */ > + > +2: add a6, a6, t1 /* we'll advance a0 before the exit check */ > +1: vle8.v v2, (a0) /* as above but unmasked so t1 elts per pass */ > + add a0, a0, t1 > + > + vsrl.vi v4, v2, 3 > + vrgather.vv v6, v16, v4 > + vsrl.vv v6, v6, v2 > + vand.vv v6, v6, v8 > + > + vmseq.vx v4, v6, zero > + vfirst.m t0, v4 > + bltz t0, 1b > + > +.Lscan_end: > + add a0, a0, t0 /* calculate offset to terminating byte */ > + sub a0, a0, a6 > + ret > +.size strspn, .-strspn > + > +/* strcspn > + * > + * table build exactly as for strspn, except: > + * - the lookup table starts with all bits except bit 0 of byte 0 set > + * - we clear the corresponding bit for each byte in the charset > + * once table is built, we can reuse the scan code directly > + */ > + > +strcspn: > + lbu t0, 0(a1) > + beqz t0, strlen /* no rejections -> prefix is whole string */ > + > + mv a6, a0 > + li t1, 32 > + > + vsetvli zero, t1, e8, m1, tu, mu > +#if __riscv_v_min_vlen < 256 > + csrr t2, vlenb > + bgeu t2, t1, 1f > + vsetvli zero, t1, e8, m2, tu, mu > +1: > +#endif // __riscv_v_min_vlen > + > + vmv.v.x v8, zero > + li t3, 1 /* all bits clear except bit 0 of byte 0 */ > + vmv.s.x v8, t3 > + vnot.v v16, v8 /* v16 is the inverse of that */ > + li t4, -1 > + > +1: vmv.v.x v2, zero > + addi a1, a1, 1 /* advance charset ptr */ > + srli t2, t0, 3 /* select correct bit in v2 */ > + vslideup.vx v2, v8, t2 > + vsll.vx v2, v2, t0 > + vnot.v v2, v2 /* invert */ > + vand.vv v16, v16, v2 /* clear the relevant bit of table */ > + lbu t0, 0(a1) > + bnez t0, 1b > + j .Lscan_table > +.size strcspn, .-strcspn > + > 
+libc_hidden_builtin_def (strspn) > +libc_hidden_builtin_def (strcspn) > \ No newline at end of file > -- > 2.34.1 >
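
For readers skimming the scalar fallback in the quoted memchr above: the mask expression described in its comments is the standard word-at-a-time zero-byte test. A minimal C sketch of the same idea, for illustration only (not part of the patch; the helper name is made up):

#include <stdint.h>

/* Return nonzero if any byte of WORD equals C.  XORing with C repeated in
   every byte turns matching bytes into zero; then
   (x - 0x01..01) & ~x & 0x80..80 is nonzero iff some byte of x is zero.  */
static inline int
word_has_byte (uint64_t word, unsigned char c)
{
  uint64_t ones = 0x0101010101010101ULL;
  uint64_t x = word ^ (ones * (uint64_t) c);  /* matching bytes become 0 */
  return ((x - ones) & ~x & (ones << 7)) != 0;
}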