* [PATCH] libstdc++: Avoid ASCII assumptions in floating_from_chars.cc
@ 2022-04-21 14:37 Patrick Palka
2022-04-21 14:52 ` Jonathan Wakely
0 siblings, 1 reply; 2+ messages in thread
From: Patrick Palka @ 2022-04-21 14:37 UTC (permalink / raw)
To: gcc-patches; +Cc: libstdc++, Patrick Palka
In starts_with_ci and in __floating_from_chars_hex's inf/nan handling,
we were assuming that the letters are contiguous and that 'A' + 32 == 'a'
which is true for ASCII but not for other character encodings. This
patch fixes starts_with_ci by using a constexpr lookup table that maps
uppercase letters to lowercase, and fixes __floating_from_chars_hex by
using __from_chars_alnum_to_val.
Tested on x86_64-pc-linux-gnu. Does this look OK for trunk?
libstdc++-v3/ChangeLog:
* include/std/charconv (__from_chars_alnum_to_val_table):
Simplify initialization of __lower/__upper_letters.
(__from_chars_alnum_to_val): Default the template parameter to
false.
* src/c++17/floating_from_chars.cc (starts_with_ci): Don't
assume the uppercase and lowercase letters are contiguous.
(__floating_from_chars_hex): Likewise.
---
libstdc++-v3/include/std/charconv | 12 ++-----
libstdc++-v3/src/c++17/floating_from_chars.cc | 33 ++++++++++++++-----
2 files changed, 28 insertions(+), 17 deletions(-)
diff --git a/libstdc++-v3/include/std/charconv b/libstdc++-v3/include/std/charconv
index 561234cb2fc..218813e4797 100644
--- a/libstdc++-v3/include/std/charconv
+++ b/libstdc++-v3/include/std/charconv
@@ -412,14 +412,8 @@ namespace __detail
constexpr auto
__from_chars_alnum_to_val_table()
{
- constexpr unsigned char __lower_letters[]
- = { 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j',
- 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't',
- 'u', 'v', 'w', 'x', 'y', 'z' };
- constexpr unsigned char __upper_letters[]
- = { 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
- 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
- 'U', 'V', 'W', 'X', 'Y', 'Z' };
+ constexpr unsigned char __lower_letters[27] = "abcdefghijklmnopqrstuvwxyz";
+ constexpr unsigned char __upper_letters[27] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
struct { unsigned char __data[1u << __CHAR_BIT__] = {}; } __table;
for (auto& __entry : __table.__data)
__entry = 127;
@@ -437,7 +431,7 @@ namespace __detail
// return its corresponding base-10 value, otherwise return a value >= 127.
// If _DecOnly is false: if the character is an alphanumeric digit, then
// return its corresponding base-36 value, otherwise return a value >= 127.
- template<bool _DecOnly>
+ template<bool _DecOnly = false>
unsigned char
__from_chars_alnum_to_val(unsigned char __c)
{
diff --git a/libstdc++-v3/src/c++17/floating_from_chars.cc b/libstdc++-v3/src/c++17/floating_from_chars.cc
index 0f5183aa9b5..71fb8c5c9a3 100644
--- a/libstdc++-v3/src/c++17/floating_from_chars.cc
+++ b/libstdc++-v3/src/c++17/floating_from_chars.cc
@@ -30,6 +30,7 @@
// Prefer to use std::pmr::string if possible, which requires the cxx11 ABI.
#define _GLIBCXX_USE_CXX11_ABI 1
+#include <array>
#include <charconv>
#include <bit>
#include <string>
@@ -451,15 +452,33 @@ namespace
#if _GLIBCXX_FLOAT_IS_IEEE_BINARY32 && _GLIBCXX_DOUBLE_IS_IEEE_BINARY64
// Return true iff [FIRST,LAST) begins with PREFIX, ignoring case.
+ // PREFIX is assumed to not contain any uppercase letters.
bool
starts_with_ci(const char* first, const char* last, string_view prefix)
{
__glibcxx_requires_valid_range(first, last);
- for (char ch : prefix)
+ // A lookup table that maps uppercase letters to lowercase and
+ // is otherwise the identity mapping.
+ static constexpr auto upper_to_lower_table = [] {
+ constexpr unsigned char lower_letters[27] = "abcdefghijklmnopqrstuvwxyz";
+ constexpr unsigned char upper_letters[27] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
+ std::array<unsigned char, (1u << __CHAR_BIT__)> table = {};
+ for (unsigned i = 0; i < table.size(); ++i)
+ table[i] = i;
+ for (unsigned i = 0; i < 26; ++i)
+ table[upper_letters[i]] = lower_letters[i];
+ return table;
+ }();
+
+ if (last - first < static_cast<ptrdiff_t>(prefix.length()))
+ return false;
+
+ for (const unsigned char pch : prefix)
{
- __glibcxx_assert(ch >= 'a' && ch <= 'z');
- if (first == last || (*first != ch && *first != ch - 32))
+ __glibcxx_assert(pch == upper_to_lower_table[pch]);
+ const unsigned char ch = *first;
+ if (ch != pch && upper_to_lower_table[ch] != pch)
return false;
++first;
}
@@ -535,10 +554,8 @@ namespace
++first;
break;
}
- else if ((ch >= '0' && ch <= '9')
- || (ch >= 'a' && ch <= 'z')
- || (ch >= 'A' && ch <= 'Z')
- || ch == '_')
+ else if (ch == '_'
+ || __detail::__from_chars_alnum_to_val(ch) < 127)
continue;
else
{
@@ -599,7 +616,7 @@ namespace
continue;
}
- int hexit = __detail::__from_chars_alnum_to_val<false>(ch);
+ int hexit = __detail::__from_chars_alnum_to_val(ch);
if (hexit >= 16)
break;
seen_hexit = true;
--
2.36.0.rc2.10.g1ac7422e39
^ permalink raw reply [flat|nested] 2+ messages in thread
* Re: [PATCH] libstdc++: Avoid ASCII assumptions in floating_from_chars.cc
2022-04-21 14:37 [PATCH] libstdc++: Avoid ASCII assumptions in floating_from_chars.cc Patrick Palka
@ 2022-04-21 14:52 ` Jonathan Wakely
0 siblings, 0 replies; 2+ messages in thread
From: Jonathan Wakely @ 2022-04-21 14:52 UTC (permalink / raw)
To: Patrick Palka; +Cc: gcc Patches, libstdc++
On Thu, 21 Apr 2022 at 15:38, Patrick Palka via Libstdc++
<libstdc++@gcc.gnu.org> wrote:
>
> In starts_with_ci and in __floating_from_chars_hex's inf/nan handling,
> we were assuming that the letters are contiguous and that 'A' + 32 == 'a'
> which is true for ASCII but not for other character encodings. This
> patch fixes starts_with_ci by using a constexpr lookup table that maps
> uppercase letters to lowercase, and fixes __floating_from_chars_hex by
> using __from_chars_alnum_to_val.
>
> Tested on x86_64-pc-linux-gnu. Does this look OK for trunk?
>
> libstdc++-v3/ChangeLog:
>
> * include/std/charconv (__from_chars_alnum_to_val_table):
> Simplify initialization of __lower/__upper_letters.
> (__from_chars_alnum_to_val): Default the template parameter to
> false.
> * src/c++17/floating_from_chars.cc (starts_with_ci): Don't
> assume the uppercase and lowercase letters are contiguous.
> (__floating_from_chars_hex): Likewise.
> ---
> libstdc++-v3/include/std/charconv | 12 ++-----
> libstdc++-v3/src/c++17/floating_from_chars.cc | 33 ++++++++++++++-----
> 2 files changed, 28 insertions(+), 17 deletions(-)
>
> diff --git a/libstdc++-v3/include/std/charconv b/libstdc++-v3/include/std/charconv
> index 561234cb2fc..218813e4797 100644
> --- a/libstdc++-v3/include/std/charconv
> +++ b/libstdc++-v3/include/std/charconv
> @@ -412,14 +412,8 @@ namespace __detail
> constexpr auto
> __from_chars_alnum_to_val_table()
> {
> - constexpr unsigned char __lower_letters[]
> - = { 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j',
> - 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't',
> - 'u', 'v', 'w', 'x', 'y', 'z' };
> - constexpr unsigned char __upper_letters[]
> - = { 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
> - 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
> - 'U', 'V', 'W', 'X', 'Y', 'Z' };
> + constexpr unsigned char __lower_letters[27] = "abcdefghijklmnopqrstuvwxyz";
> + constexpr unsigned char __upper_letters[27] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
> struct { unsigned char __data[1u << __CHAR_BIT__] = {}; } __table;
> for (auto& __entry : __table.__data)
> __entry = 127;
> @@ -437,7 +431,7 @@ namespace __detail
> // return its corresponding base-10 value, otherwise return a value >= 127.
> // If _DecOnly is false: if the character is an alphanumeric digit, then
> // return its corresponding base-36 value, otherwise return a value >= 127.
> - template<bool _DecOnly>
> + template<bool _DecOnly = false>
> unsigned char
> __from_chars_alnum_to_val(unsigned char __c)
> {
> diff --git a/libstdc++-v3/src/c++17/floating_from_chars.cc b/libstdc++-v3/src/c++17/floating_from_chars.cc
> index 0f5183aa9b5..71fb8c5c9a3 100644
> --- a/libstdc++-v3/src/c++17/floating_from_chars.cc
> +++ b/libstdc++-v3/src/c++17/floating_from_chars.cc
> @@ -30,6 +30,7 @@
> // Prefer to use std::pmr::string if possible, which requires the cxx11 ABI.
> #define _GLIBCXX_USE_CXX11_ABI 1
>
> +#include <array>
> #include <charconv>
> #include <bit>
> #include <string>
> @@ -451,15 +452,33 @@ namespace
>
> #if _GLIBCXX_FLOAT_IS_IEEE_BINARY32 && _GLIBCXX_DOUBLE_IS_IEEE_BINARY64
> // Return true iff [FIRST,LAST) begins with PREFIX, ignoring case.
> + // PREFIX is assumed to not contain any uppercase letters.
> bool
> starts_with_ci(const char* first, const char* last, string_view prefix)
> {
> __glibcxx_requires_valid_range(first, last);
>
> - for (char ch : prefix)
> + // A lookup table that maps uppercase letters to lowercase and
> + // is otherwise the identity mapping.
> + static constexpr auto upper_to_lower_table = [] {
> + constexpr unsigned char lower_letters[27] = "abcdefghijklmnopqrstuvwxyz";
> + constexpr unsigned char upper_letters[27] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
> + std::array<unsigned char, (1u << __CHAR_BIT__)> table = {};
> + for (unsigned i = 0; i < table.size(); ++i)
> + table[i] = i;
> + for (unsigned i = 0; i < 26; ++i)
> + table[upper_letters[i]] = lower_letters[i];
> + return table;
> + }();
> +
> + if (last - first < static_cast<ptrdiff_t>(prefix.length()))
> + return false;
> +
> + for (const unsigned char pch : prefix)
> {
> - __glibcxx_assert(ch >= 'a' && ch <= 'z');
> - if (first == last || (*first != ch && *first != ch - 32))
> + __glibcxx_assert(pch == upper_to_lower_table[pch]);
OK with this assertion commented out. It's just another check that the
implementation itself isn't broken, and if it is, users can't do anything about it.
> + const unsigned char ch = *first;
> + if (ch != pch && upper_to_lower_table[ch] != pch)
> return false;
> ++first;
> }
> @@ -535,10 +554,8 @@ namespace
> ++first;
> break;
> }
> - else if ((ch >= '0' && ch <= '9')
> - || (ch >= 'a' && ch <= 'z')
> - || (ch >= 'A' && ch <= 'Z')
> - || ch == '_')
> + else if (ch == '_'
> + || __detail::__from_chars_alnum_to_val(ch) < 127)
> continue;
> else
> {
> @@ -599,7 +616,7 @@ namespace
> continue;
> }
>
> - int hexit = __detail::__from_chars_alnum_to_val<false>(ch);
> + int hexit = __detail::__from_chars_alnum_to_val(ch);
> if (hexit >= 16)
> break;
> seen_hexit = true;
> --
> 2.36.0.rc2.10.g1ac7422e39
>
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2022-04-21 14:52 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-04-21 14:37 [PATCH] libstdc++: Avoid ASCII assumptions in floating_from_chars.cc Patrick Palka
2022-04-21 14:52 ` Jonathan Wakely
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).