From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 2181) id 9E00B384B05E; Thu, 7 Jul 2022 23:33:15 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 9E00B384B05E MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Content-Type: text/plain; charset="utf-8" From: Jonathan Wakely To: gcc-cvs@gcc.gnu.org, libstdc++-cvs@gcc.gnu.org Subject: [gcc r11-10130] libstdc++: Fix handling of invalid ranges in std::regex [PR102447] X-Act-Checkin: gcc X-Git-Author: Jonathan Wakely X-Git-Refname: refs/heads/releases/gcc-11 X-Git-Oldrev: ece2bb2adc2c801c4be3c6d0e2656aee633aaddd X-Git-Newrev: c725028a8bb9478ec84332641147ad12b9236922 Message-Id: <20220707233315.9E00B384B05E@sourceware.org> Date: Thu, 7 Jul 2022 23:33:15 +0000 (GMT) X-BeenThere: libstdc++-cvs@gcc.gnu.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Libstdc++-cvs mailing list List-Unsubscribe: , List-Archive: List-Help: List-Subscribe: , X-List-Received-Date: Thu, 07 Jul 2022 23:33:15 -0000 https://gcc.gnu.org/g:c725028a8bb9478ec84332641147ad12b9236922 commit r11-10130-gc725028a8bb9478ec84332641147ad12b9236922 Author: Jonathan Wakely Date: Tue Dec 14 14:32:35 2021 +0000 libstdc++: Fix handling of invalid ranges in std::regex [PR102447] std::regex currently allows invalid bracket ranges such as [\w-a] which are only allowed by ECMAScript when in web browser compatibility mode. It should be an error, because the start of the range is a character class, not a single character. The current implementation of _Compiler::_M_expression_term does not provide a way to reject this, because we only remember a previous character, not whether we just processed a character class (or collating symbol etc.) This patch replaces the pair used to emulate optional with a custom class closer to pair. That allows us to track three states, so that we can tell when we've just seen a character class. With this additional state the code in _M_expression_term for processing the _S_token_bracket_dash can be improved to correctly reject the [\w-a] case, without regressing for valid cases such as [\w-] and [----]. libstdc++-v3/ChangeLog: PR libstdc++/102447 * include/bits/regex_compiler.h (_Compiler::_BracketState): New class. (_Compiler::_BrackeyMatcher): New alias template. (_Compiler::_M_expression_term): Change pair parameter to _BracketState. Process first character for ECMAScript syntax as well as POSIX. * include/bits/regex_compiler.tcc (_Compiler::_M_insert_bracket_matcher): Pass _BracketState. (_Compiler::_M_expression_term): Use _BracketState to store state between calls. Improve handling of dashes in ranges. * testsuite/28_regex/algorithms/regex_match/cstring_bracket_01.cc: Add more tests for ranges containing dashes. Check invalid ranges with character class at the beginning. (cherry picked from commit 7ce3c230edf6e498e125c805a6dd313bf87dc439) Diff: --- libstdc++-v3/include/bits/regex_compiler.h | 40 ++++++- libstdc++-v3/include/bits/regex_compiler.tcc | 118 ++++++++++----------- .../algorithms/regex_match/cstring_bracket_01.cc | 62 ++++++++++- 3 files changed, 152 insertions(+), 68 deletions(-) diff --git a/libstdc++-v3/include/bits/regex_compiler.h b/libstdc++-v3/include/bits/regex_compiler.h index f224fcb06e0..aa19df2bf9a 100644 --- a/libstdc++-v3/include/bits/regex_compiler.h +++ b/libstdc++-v3/include/bits/regex_compiler.h @@ -122,13 +122,45 @@ namespace __detail void _M_insert_bracket_matcher(bool __neg); - // Returns true if successfully matched one term and should continue. + // Cache of the last atom seen in a bracketed range expression. + struct _BracketState + { + enum class _Type : char { _None, _Char, _Class } _M_type = _Type::_None; + _CharT _M_char; + + void + set(_CharT __c) noexcept { _M_type = _Type::_Char; _M_char = __c; } + + _GLIBCXX_NODISCARD _CharT + get() const noexcept { return _M_char; } + + void + reset(_Type __t = _Type::_None) noexcept { _M_type = __t; } + + explicit operator bool() const noexcept + { return _M_type != _Type::_None; } + + // Previous token was a single character. + _GLIBCXX_NODISCARD bool + _M_is_char() const noexcept { return _M_type == _Type::_Char; } + + // Previous token was a character class, equivalent class, + // collating symbol etc. + _GLIBCXX_NODISCARD bool + _M_is_class() const noexcept { return _M_type == _Type::_Class; } + }; + + template + using _BracketMatcher + = std::__detail::_BracketMatcher<_TraitsT, __icase, __collate>; + + // Returns true if successfully parsed one term and should continue + // compiling a bracket expression. // Returns false if the compiler should move on. template bool - _M_expression_term(pair& __last_char, - _BracketMatcher<_TraitsT, __icase, __collate>& - __matcher); + _M_expression_term(_BracketState& __last_char, + _BracketMatcher<__icase, __collate>& __matcher); int _M_cur_int_value(int __radix); diff --git a/libstdc++-v3/include/bits/regex_compiler.tcc b/libstdc++-v3/include/bits/regex_compiler.tcc index ea07bc2428e..7769a9e63a3 100644 --- a/libstdc++-v3/include/bits/regex_compiler.tcc +++ b/libstdc++-v3/include/bits/regex_compiler.tcc @@ -403,7 +403,7 @@ namespace __detail _M_insert_character_class_matcher() { __glibcxx_assert(_M_value.size() == 1); - _BracketMatcher<_TraitsT, __icase, __collate> __matcher + _BracketMatcher<__icase, __collate> __matcher (_M_ctype.is(_CtypeT::upper, _M_value[0]), _M_traits); __matcher._M_add_character_class(_M_value, false); __matcher._M_ready(); @@ -417,26 +417,17 @@ namespace __detail _Compiler<_TraitsT>:: _M_insert_bracket_matcher(bool __neg) { - _BracketMatcher<_TraitsT, __icase, __collate> __matcher(__neg, _M_traits); - pair __last_char; // Optional<_CharT> - __last_char.first = false; - if (!(_M_flags & regex_constants::ECMAScript)) - { - if (_M_try_char()) - { - __last_char.first = true; - __last_char.second = _M_value[0]; - } - else if (_M_match_token(_ScannerT::_S_token_bracket_dash)) - { - __last_char.first = true; - __last_char.second = '-'; - } - } + _BracketMatcher<__icase, __collate> __matcher(__neg, _M_traits); + _BracketState __last_char; + if (_M_try_char()) + __last_char.set(_M_value[0]); + else if (_M_match_token(_ScannerT::_S_token_bracket_dash)) + // Dash as first character is a normal character. + __last_char.set('-'); while (_M_expression_term(__last_char, __matcher)) ; - if (__last_char.first) - __matcher._M_add_char(__last_char.second); + if (__last_char._M_is_char()) + __matcher._M_add_char(__last_char.get()); __matcher._M_ready(); _M_stack.push(_StateSeqT( *_M_nfa, @@ -447,27 +438,27 @@ namespace __detail template bool _Compiler<_TraitsT>:: - _M_expression_term(pair& __last_char, - _BracketMatcher<_TraitsT, __icase, __collate>& __matcher) + _M_expression_term(_BracketState& __last_char, + _BracketMatcher<__icase, __collate>& __matcher) { if (_M_match_token(_ScannerT::_S_token_bracket_end)) return false; + // Add any previously cached char into the matcher and update cache. const auto __push_char = [&](_CharT __ch) { - if (__last_char.first) - __matcher._M_add_char(__last_char.second); - else - __last_char.first = true; - __last_char.second = __ch; + if (__last_char._M_is_char()) + __matcher._M_add_char(__last_char.get()); + __last_char.set(__ch); }; - const auto __flush = [&] + // Add any previously cached char into the matcher and update cache. + const auto __push_class = [&] { - if (__last_char.first) - { - __matcher._M_add_char(__last_char.second); - __last_char.first = false; - } + if (__last_char._M_is_char()) + __matcher._M_add_char(__last_char.get()); + // We don't cache anything here, just record that the last thing + // processed was a character class (or similar). + __last_char.reset(_BracketState::_Type::_Class); }; if (_M_match_token(_ScannerT::_S_token_collsymbol)) @@ -476,16 +467,16 @@ namespace __detail if (__symbol.size() == 1) __push_char(__symbol[0]); else - __flush(); + __push_class(); } else if (_M_match_token(_ScannerT::_S_token_equiv_class_name)) { - __flush(); + __push_class(); __matcher._M_add_equivalence_class(_M_value); } else if (_M_match_token(_ScannerT::_S_token_char_class_name)) { - __flush(); + __push_class(); __matcher._M_add_character_class(_M_value, false); } else if (_M_try_char()) @@ -502,49 +493,50 @@ namespace __detail // It turns out that no one reads BNFs ;) else if (_M_match_token(_ScannerT::_S_token_bracket_dash)) { - if (!__last_char.first) + if (_M_match_token(_ScannerT::_S_token_bracket_end)) { - if (!(_M_flags & regex_constants::ECMAScript)) - { - if (_M_match_token(_ScannerT::_S_token_bracket_end)) - { - __push_char('-'); - return false; - } - __throw_regex_error( - regex_constants::error_range, - "Unexpected dash in bracket expression. For POSIX syntax, " - "a dash is not treated literally only when it is at " - "beginning or end."); - } + // For "-]" the dash is a literal character. __push_char('-'); + return false; } - else + else if (__last_char._M_is_class()) + { + // "\\w-" is invalid, start of range must be a single char. + __throw_regex_error(regex_constants::error_range, + "Invalid start of range in bracket expression."); + } + else if (__last_char._M_is_char()) { if (_M_try_char()) { - __matcher._M_make_range(__last_char.second, _M_value[0]); - __last_char.first = false; + // "x-y" + __matcher._M_make_range(__last_char.get(), _M_value[0]); + __last_char.reset(); } else if (_M_match_token(_ScannerT::_S_token_bracket_dash)) { - __matcher._M_make_range(__last_char.second, '-'); - __last_char.first = false; + // "x--" + __matcher._M_make_range(__last_char.get(), '-'); + __last_char.reset(); } else - { - if (_M_scanner._M_get_token() - != _ScannerT::_S_token_bracket_end) - __throw_regex_error( - regex_constants::error_range, - "Character is expected after a dash."); - __push_char('-'); - } + __throw_regex_error(regex_constants::error_range, + "Invalid end of range in bracket expression."); } + else if (_M_flags & regex_constants::ECMAScript) + { + // A dash that is not part of an existing range. Might be the + // start of a new range, or might just be a literal '-' char. + // Only ECMAScript allows that in the middle of a bracket expr. + __push_char('-'); + } + else + __throw_regex_error(regex_constants::error_range, + "Invalid dash in bracket expression."); } else if (_M_match_token(_ScannerT::_S_token_quoted_class)) { - __flush(); + __push_class(); __matcher._M_add_character_class(_M_value, _M_ctype.is(_CtypeT::upper, _M_value[0])); diff --git a/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/cstring_bracket_01.cc b/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/cstring_bracket_01.cc index 7df70604ea6..0d76e63da7b 100644 --- a/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/cstring_bracket_01.cc +++ b/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/cstring_bracket_01.cc @@ -69,6 +69,16 @@ test01() void test02() { + VERIFY(regex_match("-", regex("[-]", regex_constants::ECMAScript))); + VERIFY(regex_match("-", regex("[--]", regex_constants::ECMAScript))); + VERIFY(regex_match("-", regex("[---]", regex_constants::ECMAScript))); + VERIFY(regex_match("-", regex("[----]", regex_constants::ECMAScript))); + VERIFY(regex_match("-", regex("[-----]", regex_constants::ECMAScript))); + + VERIFY(regex_match("-", regex("[-]", regex_constants::extended))); + VERIFY(regex_match("-", regex("[--]", regex_constants::extended))); + VERIFY(regex_match("-", regex("[---]", regex_constants::extended))); + VERIFY(regex_match("-", regex("[----]", regex_constants::extended))); try { std::regex re("[-----]", std::regex::extended); @@ -78,7 +88,6 @@ test02() { VERIFY(e.code() == std::regex_constants::error_range); } - std::regex re("[-----]", std::regex::ECMAScript); VERIFY(!regex_match("b", regex("[-ac]", regex_constants::extended))); VERIFY(!regex_match("b", regex("[ac-]", regex_constants::extended))); @@ -93,7 +102,27 @@ test02() } catch (const std::regex_error& e) { + VERIFY(e.code() == std::regex_constants::error_range); + } + try + { + regex("[@--]", regex_constants::extended); + VERIFY(false); } + catch (const std::regex_error& e) + { + VERIFY(e.code() == std::regex_constants::error_range); + } + try + { + regex("[--%]", regex_constants::extended); + VERIFY(false); + } + catch (const std::regex_error& e) + { + VERIFY(e.code() == std::regex_constants::error_range); + } + VERIFY(regex_match("].", regex("[][.hyphen.]-0]*", regex_constants::extended))); } @@ -158,6 +187,36 @@ test06() VERIFY(regex_match("a-", debian_cron_namespace_ok)); } +// libstdc++/102447 +void +test07() +{ + VERIFY(regex_match("-", std::regex("[\\w-]", std::regex::ECMAScript))); + VERIFY(regex_match("a", std::regex("[\\w-]", std::regex::ECMAScript))); + VERIFY(regex_match("-", std::regex("[a-]", std::regex::ECMAScript))); + VERIFY(regex_match("a", std::regex("[a-]", std::regex::ECMAScript))); + + try + { + std::regex re("[\\w-a]", std::regex::ECMAScript); + VERIFY(false); + } + catch (const std::regex_error& e) + { + VERIFY(e.code() == std::regex_constants::error_range); + } + + try + { + std::regex re("[\\w--]", std::regex::ECMAScript); + VERIFY(false); + } + catch (const std::regex_error& e) + { + VERIFY(e.code() == std::regex_constants::error_range); + } +} + int main() { @@ -167,6 +226,7 @@ main() test04(); test05(); test06(); + test07(); return 0; }