From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from us-smtp-delivery-124.mimecast.com (us-smtp-delivery-124.mimecast.com [170.10.129.124]) by sourceware.org (Postfix) with ESMTPS id 57E5B3856DC8 for ; Thu, 21 Apr 2022 14:52:56 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org 57E5B3856DC8 Received: from mail-yb1-f198.google.com (mail-yb1-f198.google.com [209.85.219.198]) by relay.mimecast.com with ESMTP with STARTTLS (version=TLSv1.2, cipher=TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384) id us-mta-207-myV3eh-WN9CORwa92icOdg-1; Thu, 21 Apr 2022 10:52:54 -0400 X-MC-Unique: myV3eh-WN9CORwa92icOdg-1 Received: by mail-yb1-f198.google.com with SMTP id a18-20020a25bad2000000b0063360821ea7so4559855ybk.15 for ; Thu, 21 Apr 2022 07:52:54 -0700 (PDT) X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=x-gm-message-state:mime-version:references:in-reply-to:from:date :message-id:subject:to:cc; bh=Mgr8sRg4H003XkENVlevip3qIFtStxY7Fl0sZdrELDQ=; b=eoU4N7lNbPs55c7usx+LXTW0Y6FTQdlIi1hhFdpwNecE8M4jBc2rFkAy2Ir2pqt/Sl j7JGRez4PMrcu0/bFMIvrcaOIIjn/TGWc9FcwalmwiDbX0Uiq4evktrMsDSRZ+VkwPQd IZLHbZYZ0xMONkFQo9chSOpfWMN4LaVVgDx50l7Ci+Lt8H6X2upURj+h0/7Q8xs1572R l8YLKErzHl68wXBwhFfQT2wIb3tSul99ryiTGl6dJOUix4LmkeqR0Bb/aacU5G8pbla/ 5j1BXRdlHH4moJ9wvqIH6HUEX4nLIOgP6sth2kIHBCbwiLjz1irIgLgxV478q8jXks+V Ub/Q== X-Gm-Message-State: AOAM530Cdx/YeyR+JVOhuVAiXHpH5OH6AQf5hJ87Zc3bLKDrNDcBggAY 11fLp9uhSU/hXDW2zg76vPcbfm5MO+rJuBigImPxWbEfK1qA3PNPmLpB71zm+EdfUCdNRlOv63A AjLjRDnO4MH3gxqWw9voRxZlnZagrlNE= X-Received: by 2002:a25:1143:0:b0:645:7507:ae5e with SMTP id 64-20020a251143000000b006457507ae5emr3171030ybr.415.1650552774354; Thu, 21 Apr 2022 07:52:54 -0700 (PDT) X-Google-Smtp-Source: ABdhPJzyWEAG3xMnZN2Y69LEJZU2UwTIykbubRV/VcOaNQ2jPyPJ0cyf3rMaYof4Cv+/KSZZ2+UNgjArveqw7GrcbZU= X-Received: by 2002:a25:1143:0:b0:645:7507:ae5e with SMTP id 64-20020a251143000000b006457507ae5emr3171020ybr.415.1650552774091; Thu, 21 Apr 2022 07:52:54 -0700 (PDT) 
MIME-Version: 1.0 References: <20220421143729.1424170-1-ppalka@redhat.com> In-Reply-To: <20220421143729.1424170-1-ppalka@redhat.com> From: Jonathan Wakely Date: Thu, 21 Apr 2022 15:52:43 +0100 Message-ID: Subject: Re: [PATCH] libstdc++: Avoid ASCII assumptions in floating_from_chars.cc To: Patrick Palka Cc: gcc Patches , "libstdc++" X-Mimecast-Spam-Score: 0 X-Mimecast-Originator: redhat.com Content-Type: text/plain; charset="UTF-8" X-Spam-Status: No, score=-14.7 required=5.0 tests=BAYES_00, DKIMWL_WL_HIGH, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, GIT_PATCH_0, RCVD_IN_DNSWL_LOW, SPF_HELO_NONE, SPF_NONE, TXREP autolearn=unavailable autolearn_force=no version=3.4.4 X-Spam-Checker-Version: SpamAssassin 3.4.4 (2020-01-24) on server2.sourceware.org X-BeenThere: libstdc++@gcc.gnu.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Libstdc++ mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Thu, 21 Apr 2022 14:52:59 -0000 On Thu, 21 Apr 2022 at 15:38, Patrick Palka via Libstdc++ wrote: > > In starts_with_ci and in __floating_from_chars_hex's inf/nan handling, > we were assuming that the letters are contiguous and that 'A' + 32 == 'a' > which is true for ASCII but not for other character encodings. This > patch fixes starts_with_ci by using a constexpr lookup table that maps > uppercase letters to lowercase, and fixes __floating_from_chars_hex by > using __from_chars_alnum_to_val. > > Tested on x86_64-pc-linux-gnu, does this look OK for trunk? > > libstdc++-v3/ChangeLog: > > * include/std/charconv (__from_chars_alnum_to_val_table): > Simplify initialization of __lower/__upper_letters. > (__from_chars_alnum_to_val): Default the template parameter to > false. > * src/c++17/floating_from_chars.cc (starts_with_ci): Don't > assume the uppercase and lowercase letters are contiguous. > (__floating_from_chars_hex): Likewise. 
> --- > libstdc++-v3/include/std/charconv | 12 ++----- > libstdc++-v3/src/c++17/floating_from_chars.cc | 33 ++++++++++++++----- > 2 files changed, 28 insertions(+), 17 deletions(-) > > diff --git a/libstdc++-v3/include/std/charconv b/libstdc++-v3/include/std/charconv > index 561234cb2fc..218813e4797 100644 > --- a/libstdc++-v3/include/std/charconv > +++ b/libstdc++-v3/include/std/charconv > @@ -412,14 +412,8 @@ namespace __detail > constexpr auto > __from_chars_alnum_to_val_table() > { > - constexpr unsigned char __lower_letters[] > - = { 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', > - 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', > - 'u', 'v', 'w', 'x', 'y', 'z' }; > - constexpr unsigned char __upper_letters[] > - = { 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', > - 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', > - 'U', 'V', 'W', 'X', 'Y', 'Z' }; > + constexpr unsigned char __lower_letters[27] = "abcdefghijklmnopqrstuvwxyz"; > + constexpr unsigned char __upper_letters[27] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; > struct { unsigned char __data[1u << __CHAR_BIT__] = {}; } __table; > for (auto& __entry : __table.__data) > __entry = 127; > @@ -437,7 +431,7 @@ namespace __detail > // return its corresponding base-10 value, otherwise return a value >= 127. > // If _DecOnly is false: if the character is an alphanumeric digit, then > // return its corresponding base-36 value, otherwise return a value >= 127. > - template<bool _DecOnly> > + template<bool _DecOnly = false> > unsigned char > __from_chars_alnum_to_val(unsigned char __c) > { > diff --git a/libstdc++-v3/src/c++17/floating_from_chars.cc b/libstdc++-v3/src/c++17/floating_from_chars.cc > index 0f5183aa9b5..71fb8c5c9a3 100644 > --- a/libstdc++-v3/src/c++17/floating_from_chars.cc > +++ b/libstdc++-v3/src/c++17/floating_from_chars.cc > @@ -30,6 +30,7 @@ > // Prefer to use std::pmr::string if possible, which requires the cxx11 ABI. 
> #define _GLIBCXX_USE_CXX11_ABI 1 > > +#include <array> > #include <charconv> > #include <bit> > #include <string> > @@ -451,15 +452,33 @@ namespace > > #if _GLIBCXX_FLOAT_IS_IEEE_BINARY32 && _GLIBCXX_DOUBLE_IS_IEEE_BINARY64 > // Return true iff [FIRST,LAST) begins with PREFIX, ignoring case. > + // PREFIX is assumed to not contain any uppercase letters. > bool > starts_with_ci(const char* first, const char* last, string_view prefix) > { > __glibcxx_requires_valid_range(first, last); > > - for (char ch : prefix) > + // A lookup table that maps uppercase letters to lowercase and > + // is otherwise the identity mapping. > + static constexpr auto upper_to_lower_table = [] { > + constexpr unsigned char lower_letters[27] = "abcdefghijklmnopqrstuvwxyz"; > + constexpr unsigned char upper_letters[27] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; > + std::array<unsigned char, (1u << __CHAR_BIT__)> table = {}; > + for (unsigned i = 0; i < table.size(); ++i) > + table[i] = i; > + for (unsigned i = 0; i < 26; ++i) > + table[upper_letters[i]] = lower_letters[i]; > + return table; > + }(); > + > + if (last - first < static_cast<ptrdiff_t>(prefix.length())) > + return false; > + > + for (const unsigned char pch : prefix) > { > - __glibcxx_assert(ch >= 'a' && ch <= 'z'); > - if (first == last || (*first != ch && *first != ch - 32)) > + __glibcxx_assert(pch == upper_to_lower_table[pch]); OK with this assertion commented out. It's another check that the impl isn't broken, which users can't do anything about if it's broken. 
> + const unsigned char ch = *first; > + if (ch != pch && upper_to_lower_table[ch] != pch) > return false; > ++first; > } > @@ -535,10 +554,8 @@ namespace > ++first; > break; > } > - else if ((ch >= '0' && ch <= '9') > - || (ch >= 'a' && ch <= 'z') > - || (ch >= 'A' && ch <= 'Z') > - || ch == '_') > + else if (ch == '_' > + || __detail::__from_chars_alnum_to_val(ch) < 127) > continue; > else > { > @@ -599,7 +616,7 @@ namespace > continue; > } > > - int hexit = __detail::__from_chars_alnum_to_val<false>(ch); > + int hexit = __detail::__from_chars_alnum_to_val(ch); > if (hexit >= 16) > break; > seen_hexit = true; > -- > 2.36.0.rc2.10.g1ac7422e39 >