public inbox for libc-alpha@sourceware.org
 help / color / mirror / Atom feed
From: Xi Ruoyao <xry111@xry111.site>
To: Adhemerval Zanella Netto <adhemerval.zanella@linaro.org>,
	 "dengjianbo@loongson.cn" <dengjianbo@loongson.cn>
Cc: libc-alpha <libc-alpha@sourceware.org>,
	caiyinyu <caiyinyu@loongson.cn>,
	xuchenghua <xuchenghua@loongson.cn>,
	"i.swmail" <i.swmail@xen0n.name>,
	joseph <joseph@codesourcery.com>
Subject: Re: [PATCH 0/2] LoongArch: Add optimized functions.
Date: Mon, 26 Sep 2022 21:49:04 +0800	[thread overview]
Message-ID: <8411c465e01de9608633f8b1fd2d82d3ef16f001.camel@xry111.site> (raw)
In-Reply-To: <1fec4245-9eb4-108d-722e-ba36a1df0023@linaro.org>

[-- Attachment #1: Type: text/plain, Size: 762 bytes --]

Hi Adhemerval and Jianbo,

I've customized string-fzi.h and string-maskoff.h for LoongArch (see
attachment).  With them on top of Adhemerval's v5 "Improve generic
string routines" patch and GCC & Binutils trunk, the benchmark result
seems comparable with the assembly version for strchr, strcmp, and
strchrnul.

By the way, I've tried to unroll the loop in strchr manually, but then
the compiler produced some bad code (moving words from one register to
another for no reason) and the result is slower.

I've not really plotted the result, just took a quick look with my
eyes.  You can try the bench with my headers in sysdeps/loongarch.
> 

-- 
Xi Ruoyao <xry111@xry111.site>
School of Aerospace Science and Technology, Xidian University

[-- Attachment #2: string-maskoff.h --]
[-- Type: text/x-chdr, Size: 2955 bytes --]

/* Mask off bits.  LoongArch version.
   Copyright (C) 2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#ifndef _STRING_MASKOFF_H
#define _STRING_MASKOFF_H 1

#include <endian.h>
#include <limits.h>
#include <stdint.h>
#include <string-optype.h>

/* Provide a mask based on the pointer alignment that sets up non-zero
   bytes before the beginning of the word.  It is used to mask off
   undesirable bits from an aligned read from an unaligned pointer.
   For instance, on a 64 bits machine with a pointer alignment of
   3 the function returns 0x0000000000ffffff for LE and 0xffffff0000000000
   (meaning to mask off the initial 3 bytes).  */
static inline op_t
create_mask (uintptr_t i)
{
  i = i % sizeof (op_t);
  return ~(((op_t)-1) << (i * CHAR_BIT));
}

/* Setup an word with each byte being c_in.  For instance, on a 64 bits
   machine with input as 0xce the functions returns 0xcececececececece.  */
static inline op_t
repeat_bytes (unsigned char c_in)
{
  op_t r = c_in * 0x01010101;

  _Static_assert (sizeof (op_t) == 4 || sizeof (op_t) == 8,
		  "unsupported op_t size");

  if (sizeof (op_t) == 8)
    asm ("bstrins.d\t%0, %0, 63, 32" : "+r" (r));

  return r;
}

/* Given a mask M as produced by 'create_mask', clear the most
   significant bit of every byte in it.  It is used to mask off
   undesirable bits from an aligned read from an unaligned pointer while
   taking care not to produce a match on bytes that are not meant to be
   matched.  For instance, on a 64-bit machine with a mask created from
   a pointer with an alignment of 3 (0x0000000000ffffff) the function
   returns 0x00000000007f7f7f.  */
static inline op_t
highbit_mask (op_t m)
{
  /* 0x7f in every byte: all bits set except each byte's top bit.  */
  const op_t low_seven_bits = repeat_bytes (0x7f);

  return low_seven_bits & m;
}

/* Return the address of the op_t-sized, op_t-aligned word that contains
   the byte at address P.  For instance, for address 0x0011223344556677
   and an op_t of size 8 the result is 0x0011223344556670.  */
static inline op_t *
word_containing (char const *p)
{
  _Static_assert (sizeof (op_t) == 4 || sizeof (op_t) == 8,
		  "unsupported op_t size");

  char const *aligned = p;

  /* Insert zeros into the low bits of the address: bits 1..0 for a
     4-byte op_t, bits 2..0 for an 8-byte op_t.  */
  if (sizeof (op_t) == 4)
    asm ("bstrins.d\t%0, $zero, 1, 0" : "+r" (aligned));
  else
    asm ("bstrins.d\t%0, $zero, 2, 0" : "+r" (aligned));

  return (op_t *) aligned;
}

#endif /* _STRING_MASKOFF_H  */

[-- Attachment #3: string-fzi.h --]
[-- Type: text/x-chdr, Size: 2825 bytes --]

/* Zero byte detection; indexes.  LoongArch version.
   Copyright (C) 2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#ifndef _STRING_FZI_H
#define _STRING_FZI_H 1

#include <limits.h>
#include <endian.h>
#include <string-fza.h>
#include <gmp.h>
#include <stdlib/gmp-impl.h>
#include <stdlib/longlong.h>

/* Helper for the index_first_* functions.  Given a test word C, return
   the index, counted from the least significant byte, of the first byte
   that is non-zero.  C is expected to be non-zero; the count-trailing-
   zeros builtin gives no defined result for a zero argument.  */
static inline unsigned int
index_first_ (op_t c)
{
  _Static_assert (sizeof (op_t) == sizeof (long), "op_t must be long");

  /* Number of zero bits below the lowest set bit; dividing by the bits
     per byte converts the bit position into a byte index.  */
  const int trailing_zero_bits = __builtin_ctzl (c);

  return trailing_zero_bits / CHAR_BIT;
}

/* Helper for the index_last_* functions.  Given a test word C, return
   the index, counted from the least significant byte, of the last byte
   that is non-zero.  C is expected to be non-zero; the count-leading-
   zeros builtin gives no defined result for a zero argument.  */
static inline unsigned int
index_last_ (op_t c)
{
  _Static_assert (sizeof (op_t) == sizeof (long), "op_t must be long");

  /* Whole zero bytes above the highest set bit.  */
  const int leading_zero_bits = __builtin_clzl (c);

  /* Count down from the index of the most significant byte.  */
  return (sizeof (op_t) - 1) - (leading_zero_bits / CHAR_BIT);
}

/* Given a word X that is known to contain a zero byte, return the index
   (in memory order) of the first zero byte within the word.  The word
   is run through find_zero_low and the first non-zero byte of that
   result is located with index_first_.  */
static inline unsigned int
index_first_zero (op_t x)
{
  return index_first_ (find_zero_low (x));
}

/* Given words X1 and X2 that are known to have an equal byte at some
   position, return the index (in memory order) of the first such
   position.  The pair is run through find_eq_low and the first non-zero
   byte of that result is located with index_first_.  */
static inline unsigned int
index_first_eq (op_t x1, op_t x2)
{
  return index_first_ (find_eq_low (x1, x2));
}

/* Return the index (in memory order) of the first byte position at which
   X1 holds a zero byte or X1 and X2 hold equal bytes; such a position is
   expected to exist.  The pair is run through find_zero_eq_low and the
   first non-zero byte of that result is located with index_first_.  */
static inline unsigned int
index_first_zero_eq (op_t x1, op_t x2)
{
  return index_first_ (find_zero_eq_low (x1, x2));
}

/* Return the index (in memory order) of the first byte position at which
   X1 holds a zero byte or X1 and X2 hold different bytes; such a
   position is expected to exist.  The pair is run through
   find_zero_ne_low and the first non-zero byte of that result is located
   with index_first_.  */
static inline unsigned int
index_first_zero_ne (op_t x1, op_t x2)
{
  return index_first_ (find_zero_ne_low (x1, x2));
}

/* Given a word X that is known to contain a zero byte, return the index
   (in memory order) of the last zero byte within the word.  The word is
   run through find_zero_all and the last non-zero byte of that result is
   located with index_last_.  */
static inline unsigned int
index_last_zero (op_t x)
{
  return index_last_ (find_zero_all (x));
}

/* Return the index (in memory order) of the last byte position at which
   X1 and X2 hold equal bytes; such a position is expected to exist.  */
static inline unsigned int
index_last_eq (op_t x1, op_t x2)
{
  /* Bytes that are equal in X1 and X2 become zero bytes in the XOR, so
     the search reduces to finding the last zero byte.  */
  const op_t diff = x1 ^ x2;

  return index_last_zero (diff);
}

#endif /* _STRING_FZI_H  */

  reply	other threads:[~2022-09-26 13:49 UTC|newest]

Thread overview: 18+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-08-15  8:57 caiyinyu
2022-08-15  8:57 ` [PATCH 1/2] LoongArch: Add optimized string functions: str{chr, chrnul, cmp, ncmp} caiyinyu
2022-08-15  8:57 ` [PATCH 2/2] LoongArch: Add optimized function: memmove caiyinyu
2022-08-15 14:02 ` [PATCH 0/2] LoongArch: Add optimized functions Carlos O'Donell
2022-08-15 20:46   ` Joseph Myers
     [not found]     ` <ccc3c93d-07d0-ea9b-562c-aeaec8914f20@loongson.cn>
2022-09-02  9:05       ` Fwd: " dengjianbo
2022-09-02 12:27     ` Adhemerval Zanella Netto
     [not found]       ` <403f78f0-55d9-48cf-c62a-4a0462a76987@loongson.cn>
2022-09-19  2:03         ` dengjianbo
2022-09-19 20:16           ` Adhemerval Zanella Netto
2022-09-20  9:54             ` Xi Ruoyao
2022-09-22 18:05               ` Adhemerval Zanella Netto
2022-09-26 13:49                 ` Xi Ruoyao [this message]
2022-09-28 14:22                   ` Richard Henderson
2022-09-28 16:42                     ` Xi Ruoyao
2022-09-28 19:18                       ` Richard Henderson
2022-10-10  1:39                         ` Lulu Cheng
2022-09-29  3:00                       ` Lulu Cheng
2022-09-29 11:45                   ` Adhemerval Zanella Netto

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=8411c465e01de9608633f8b1fd2d82d3ef16f001.camel@xry111.site \
    --to=xry111@xry111.site \
    --cc=adhemerval.zanella@linaro.org \
    --cc=caiyinyu@loongson.cn \
    --cc=dengjianbo@loongson.cn \
    --cc=i.swmail@xen0n.name \
    --cc=joseph@codesourcery.com \
    --cc=libc-alpha@sourceware.org \
    --cc=xuchenghua@loongson.cn \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).