public inbox for libstdc++@gcc.gnu.org
 help / color / mirror / Atom feed
From: Jonathan Wakely <jwakely@redhat.com>
To: Patrick Palka <ppalka@redhat.com>
Cc: gcc Patches <gcc-patches@gcc.gnu.org>,
	"libstdc++" <libstdc++@gcc.gnu.org>
Subject: Re: [PATCH] libstdc++: Avoid ASCII assumptions in floating_from_chars.cc
Date: Thu, 21 Apr 2022 15:52:43 +0100	[thread overview]
Message-ID: <CACb0b4=GSZtfN8Q7WsUb_e3Bqf90Y6rVjsT7Gu=TW9k9+-Xq-Q@mail.gmail.com> (raw)
In-Reply-To: <20220421143729.1424170-1-ppalka@redhat.com>

On Thu, 21 Apr 2022 at 15:38, Patrick Palka via Libstdc++
<libstdc++@gcc.gnu.org> wrote:
>
> In starts_with_ci and in __floating_from_chars_hex's inf/nan handling,
> we were assuming that the letters are contiguous and that 'A' + 32 == 'a'
> which is true for ASCII but not for other character encodings.  This
> patch fixes starts_with_ci by using a constexpr lookup table that maps
> uppercase letters to lowercase, and fixes __floating_from_chars_hex by
> using __from_chars_alnum_to_val.
>
> Tested on x86_64-pc-linux-gnu, does this look OK for trunk?
>
> libstdc++-v3/ChangeLog:
>
>         * include/std/charconv (__from_chars_alnum_to_val_table):
>         Simplify initialization of __lower/__upper_letters.
>         (__from_chars_alnum_to_val): Default the template parameter to
>         false.
>         * src/c++17/floating_from_chars.cc (starts_with_ci): Don't
>         assume the uppercase and lowercase letters are contiguous.
>         (__floating_from_chars_hex): Likewise.
> ---
>  libstdc++-v3/include/std/charconv             | 12 ++-----
>  libstdc++-v3/src/c++17/floating_from_chars.cc | 33 ++++++++++++++-----
>  2 files changed, 28 insertions(+), 17 deletions(-)
>
> diff --git a/libstdc++-v3/include/std/charconv b/libstdc++-v3/include/std/charconv
> index 561234cb2fc..218813e4797 100644
> --- a/libstdc++-v3/include/std/charconv
> +++ b/libstdc++-v3/include/std/charconv
> @@ -412,14 +412,8 @@ namespace __detail
>    constexpr auto
>    __from_chars_alnum_to_val_table()
>    {
> -    constexpr unsigned char __lower_letters[]
> -      = { 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j',
> -         'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't',
> -         'u', 'v', 'w', 'x', 'y', 'z' };
> -    constexpr unsigned char __upper_letters[]
> -      = { 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
> -         'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
> -         'U', 'V', 'W', 'X', 'Y', 'Z' };
> +    constexpr unsigned char __lower_letters[27] = "abcdefghijklmnopqrstuvwxyz";
> +    constexpr unsigned char __upper_letters[27] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
>      struct { unsigned char __data[1u << __CHAR_BIT__] = {}; } __table;
>      for (auto& __entry : __table.__data)
>        __entry = 127;
> @@ -437,7 +431,7 @@ namespace __detail
>    // return its corresponding base-10 value, otherwise return a value >= 127.
>    // If _DecOnly is false: if the character is an alphanumeric digit, then
>    // return its corresponding base-36 value, otherwise return a value >= 127.
> -  template<bool _DecOnly>
> +  template<bool _DecOnly = false>
>      unsigned char
>      __from_chars_alnum_to_val(unsigned char __c)
>      {
> diff --git a/libstdc++-v3/src/c++17/floating_from_chars.cc b/libstdc++-v3/src/c++17/floating_from_chars.cc
> index 0f5183aa9b5..71fb8c5c9a3 100644
> --- a/libstdc++-v3/src/c++17/floating_from_chars.cc
> +++ b/libstdc++-v3/src/c++17/floating_from_chars.cc
> @@ -30,6 +30,7 @@
>  // Prefer to use std::pmr::string if possible, which requires the cxx11 ABI.
>  #define _GLIBCXX_USE_CXX11_ABI 1
>
> +#include <array>
>  #include <charconv>
>  #include <bit>
>  #include <string>
> @@ -451,15 +452,33 @@ namespace
>
>  #if _GLIBCXX_FLOAT_IS_IEEE_BINARY32 && _GLIBCXX_DOUBLE_IS_IEEE_BINARY64
>    // Return true iff [FIRST,LAST) begins with PREFIX, ignoring case.
> +  // PREFIX is assumed to not contain any uppercase letters.
>    bool
>    starts_with_ci(const char* first, const char* last, string_view prefix)
>    {
>      __glibcxx_requires_valid_range(first, last);
>
> -    for (char ch : prefix)
> +    // A lookup table that maps uppercase letters to lowercase and
> +    // is otherwise the identity mapping.
> +    static constexpr auto upper_to_lower_table = [] {
> +      constexpr unsigned char lower_letters[27] = "abcdefghijklmnopqrstuvwxyz";
> +      constexpr unsigned char upper_letters[27] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
> +      std::array<unsigned char, (1u << __CHAR_BIT__)> table = {};
> +      for (unsigned i = 0; i < table.size(); ++i)
> +       table[i] = i;
> +      for (unsigned i = 0; i < 26; ++i)
> +       table[upper_letters[i]] = lower_letters[i];
> +      return table;
> +    }();
> +
> +    if (last - first < static_cast<ptrdiff_t>(prefix.length()))
> +      return false;
> +
> +    for (const unsigned char pch : prefix)
>        {
> -       __glibcxx_assert(ch >= 'a' && ch <= 'z');
> -       if (first == last || (*first != ch && *first != ch - 32))
> +       __glibcxx_assert(pch == upper_to_lower_table[pch]);

OK with this assertion commented out. It's another check that the
implementation itself isn't broken, and if it is broken there is nothing
users can do about it.


> +       const unsigned char ch = *first;
> +       if (ch != pch && upper_to_lower_table[ch] != pch)
>           return false;
>         ++first;
>        }
> @@ -535,10 +554,8 @@ namespace
>                           ++first;
>                           break;
>                         }
> -                     else if ((ch >= '0' && ch <= '9')
> -                              || (ch >= 'a' && ch <= 'z')
> -                              || (ch >= 'A' && ch <= 'Z')
> -                              || ch == '_')
> +                     else if (ch == '_'
> +                              || __detail::__from_chars_alnum_to_val(ch) < 127)
>                         continue;
>                       else
>                         {
> @@ -599,7 +616,7 @@ namespace
>             continue;
>           }
>
> -       int hexit = __detail::__from_chars_alnum_to_val<false>(ch);
> +       int hexit = __detail::__from_chars_alnum_to_val(ch);
>         if (hexit >= 16)
>           break;
>         seen_hexit = true;
> --
> 2.36.0.rc2.10.g1ac7422e39
>


      reply	other threads:[~2022-04-21 14:52 UTC|newest]

Thread overview: 2+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-04-21 14:37 Patrick Palka
2022-04-21 14:52 ` Jonathan Wakely [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to='CACb0b4=GSZtfN8Q7WsUb_e3Bqf90Y6rVjsT7Gu=TW9k9+-Xq-Q@mail.gmail.com' \
    --to=jwakely@redhat.com \
    --cc=gcc-patches@gcc.gnu.org \
    --cc=libstdc++@gcc.gnu.org \
    --cc=ppalka@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).