From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from us-smtp-delivery-124.mimecast.com (us-smtp-delivery-124.mimecast.com [170.10.129.124]) by sourceware.org (Postfix) with ESMTPS id 6808F3856DD6 for ; Thu, 21 Apr 2022 14:37:34 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org 6808F3856DD6 Received: from mail-qv1-f72.google.com (mail-qv1-f72.google.com [209.85.219.72]) by relay.mimecast.com with ESMTP with STARTTLS (version=TLSv1.2, cipher=TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384) id us-mta-145-0wOJfqTkPWG-iy7sk8zZnA-1; Thu, 21 Apr 2022 10:37:32 -0400 X-MC-Unique: 0wOJfqTkPWG-iy7sk8zZnA-1 Received: by mail-qv1-f72.google.com with SMTP id e2-20020a0cf342000000b00446596663a9so4053359qvm.13 for ; Thu, 21 Apr 2022 07:37:32 -0700 (PDT) X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=x-gm-message-state:from:to:cc:subject:date:message-id:mime-version :content-transfer-encoding; bh=ld/9QdvSaCkWjjz+wOuxkqAjwkqzjnkmZpbBs5Rnfn4=; b=NuPU4+hel3gCF5Ih0OrbdyRGzF42TIeeO8EcA5b5k7YjrjQcWGGjNvYtJHzwiRDunl 4XLT2qu5zvVJoGwY8hvcdIGUMuMHWC+06zd1EMNhFOVSDbefqt6KYQtaF5haQ+NntF4s yY9ExSHe5jvcwtUa7ZKYMMICtnUJUlSqoEOmUaW/ncoH3vnKzv6k8D9fNONHUGwlCdfO O31hWYyDkrobUTAVK4MOOqWAkFjduGJjVVQbDUCL7RUJbXQF3rcHJLRKP5QgalPtbi1w +3/gyjtj/pY/AcMLq6+BdfiYqPD059X8malKWRU2ugy8OoNmOq/rpLt0MfFAru3O2shR ZKnA== X-Gm-Message-State: AOAM532KtWNb88sgokRnsfqThgQ7k9uKaKLdHS7+Dhqz50/An0oxHwbC 3VxgkePStPk2W0gsjRfc2kGV9o8YykHenNa13wWEsbF/nBF7MEBM8ZLoiL6jiV6QnRQunGAfzGM R41/24VQdCKjJ05o= X-Received: by 2002:a05:6214:252c:b0:446:5dd8:4217 with SMTP id gg12-20020a056214252c00b004465dd84217mr21992qvb.95.1650551852301; Thu, 21 Apr 2022 07:37:32 -0700 (PDT) X-Google-Smtp-Source: ABdhPJyhTTN3oYkgw3TOJdohWc3t/NRkMFfbvD8HAkSX4uwTwHxE9FCX6mHCS+8DTQGrSgFEJGOKSw== X-Received: by 2002:a05:6214:252c:b0:446:5dd8:4217 with SMTP id gg12-20020a056214252c00b004465dd84217mr21970qvb.95.1650551852056; Thu, 21 Apr 2022 07:37:32 -0700 (PDT) Received: from localhost.localdomain 
(ool-18e40894.dyn.optonline.net. [24.228.8.148]) by smtp.gmail.com with ESMTPSA id x22-20020a05620a099600b0069e5db6be55sm2996990qkx.36.2022.04.21.07.37.31 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Thu, 21 Apr 2022 07:37:31 -0700 (PDT) From: Patrick Palka To: gcc-patches@gcc.gnu.org Cc: libstdc++@gcc.gnu.org, Patrick Palka Subject: [PATCH] libstdc++: Avoid ASCII assumptions in floating_from_chars.cc Date: Thu, 21 Apr 2022 10:37:29 -0400 Message-Id: <20220421143729.1424170-1-ppalka@redhat.com> X-Mailer: git-send-email 2.36.0.rc2.10.g1ac7422e39 MIME-Version: 1.0 X-Mimecast-Spam-Score: 0 X-Mimecast-Originator: redhat.com Content-Transfer-Encoding: 8bit Content-Type: text/plain; charset="US-ASCII"; x-default=true X-Spam-Status: No, score=-14.6 required=5.0 tests=BAYES_00, DKIMWL_WL_HIGH, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, GIT_PATCH_0, RCVD_IN_DNSWL_LOW, SPF_HELO_NONE, SPF_NONE, TXREP autolearn=ham autolearn_force=no version=3.4.4 X-Spam-Checker-Version: SpamAssassin 3.4.4 (2020-01-24) on server2.sourceware.org X-BeenThere: libstdc++@gcc.gnu.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Libstdc++ mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Thu, 21 Apr 2022 14:37:36 -0000 In starts_with_ci and in __floating_from_chars_hex's inf/nan handling, we were assuming that the letters are contiguous and that 'A' + 32 == 'a' which is true for ASCII but not for other character encodings. This patch fixes starts_with_ci by using a constexpr lookup table that maps uppercase letters to lowercase, and fixes __floating_from_chars_hex by using __from_chars_alnum_to_val. Tested on x86_64-pc-linux-gnu, does this look OK for trunk? libstdc++-v3/ChangeLog: * include/std/charconv (__from_chars_alnum_to_val_table): Simplify initialization of __lower/__upper_letters. (__from_chars_alnum_to_val): Default the template parameter to false. 
* src/c++17/floating_from_chars.cc (starts_with_ci): Don't assume the uppercase and lowercase letters are contiguous. (__floating_from_chars_hex): Likewise. --- libstdc++-v3/include/std/charconv | 12 ++----- libstdc++-v3/src/c++17/floating_from_chars.cc | 33 ++++++++++++++----- 2 files changed, 28 insertions(+), 17 deletions(-) diff --git a/libstdc++-v3/include/std/charconv b/libstdc++-v3/include/std/charconv index 561234cb2fc..218813e4797 100644 --- a/libstdc++-v3/include/std/charconv +++ b/libstdc++-v3/include/std/charconv @@ -412,14 +412,8 @@ namespace __detail constexpr auto __from_chars_alnum_to_val_table() { - constexpr unsigned char __lower_letters[] - = { 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', - 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', - 'u', 'v', 'w', 'x', 'y', 'z' }; - constexpr unsigned char __upper_letters[] - = { 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', - 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', - 'U', 'V', 'W', 'X', 'Y', 'Z' }; + constexpr unsigned char __lower_letters[27] = "abcdefghijklmnopqrstuvwxyz"; + constexpr unsigned char __upper_letters[27] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; struct { unsigned char __data[1u << __CHAR_BIT__] = {}; } __table; for (auto& __entry : __table.__data) __entry = 127; @@ -437,7 +431,7 @@ namespace __detail // return its corresponding base-10 value, otherwise return a value >= 127. // If _DecOnly is false: if the character is an alphanumeric digit, then // return its corresponding base-36 value, otherwise return a value >= 127. - template<bool _DecOnly> + template<bool _DecOnly = false> unsigned char __from_chars_alnum_to_val(unsigned char __c) { diff --git a/libstdc++-v3/src/c++17/floating_from_chars.cc b/libstdc++-v3/src/c++17/floating_from_chars.cc index 0f5183aa9b5..71fb8c5c9a3 100644 --- a/libstdc++-v3/src/c++17/floating_from_chars.cc +++ b/libstdc++-v3/src/c++17/floating_from_chars.cc @@ -30,6 +30,7 @@ // Prefer to use std::pmr::string if possible, which requires the cxx11 ABI. 
#define _GLIBCXX_USE_CXX11_ABI 1 +#include <array> #include <charconv> #include <bit> #include <string> @@ -451,15 +452,33 @@ namespace #if _GLIBCXX_FLOAT_IS_IEEE_BINARY32 && _GLIBCXX_DOUBLE_IS_IEEE_BINARY64 // Return true iff [FIRST,LAST) begins with PREFIX, ignoring case. + // PREFIX is assumed to not contain any uppercase letters. bool starts_with_ci(const char* first, const char* last, string_view prefix) { __glibcxx_requires_valid_range(first, last); - for (char ch : prefix) + // A lookup table that maps uppercase letters to lowercase and + // is otherwise the identity mapping. + static constexpr auto upper_to_lower_table = [] { + constexpr unsigned char lower_letters[27] = "abcdefghijklmnopqrstuvwxyz"; + constexpr unsigned char upper_letters[27] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; + std::array<unsigned char, (1u << __CHAR_BIT__)> table = {}; + for (unsigned i = 0; i < table.size(); ++i) + table[i] = i; + for (unsigned i = 0; i < 26; ++i) + table[upper_letters[i]] = lower_letters[i]; + return table; + }(); + + if (last - first < static_cast<ptrdiff_t>(prefix.length())) + return false; + + for (const unsigned char pch : prefix) { - __glibcxx_assert(ch >= 'a' && ch <= 'z'); - if (first == last || (*first != ch && *first != ch - 32)) + __glibcxx_assert(pch == upper_to_lower_table[pch]); + const unsigned char ch = *first; + if (ch != pch && upper_to_lower_table[ch] != pch) return false; ++first; } @@ -535,10 +554,8 @@ namespace ++first; break; } - else if ((ch >= '0' && ch <= '9') - || (ch >= 'a' && ch <= 'z') - || (ch >= 'A' && ch <= 'Z') - || ch == '_') + else if (ch == '_' + || __detail::__from_chars_alnum_to_val(ch) < 127) continue; else { @@ -599,7 +616,7 @@ namespace continue; } - int hexit = __detail::__from_chars_alnum_to_val<false>(ch); + int hexit = __detail::__from_chars_alnum_to_val(ch); if (hexit >= 16) break; seen_hexit = true; -- 2.36.0.rc2.10.g1ac7422e39