From: Jonathan Wakely <jwakely@redhat.com>
To: libstdc++@gcc.gnu.org, gcc-patches@gcc.gnu.org
Subject: [committed] libstdc++: Fix handling of invalid ranges in std::regex [PR102447]
Date: Tue, 14 Dec 2021 21:47:53 +0000 [thread overview]
Message-ID: <20211214214753.2133124-1-jwakely@redhat.com> (raw)
Tested powerpc64le-linux, pushed to trunk.
std::regex currently allows invalid bracket ranges such as [\w-a] which
are only allowed by ECMAScript when in web browser compatibility mode.
It should be an error, because the start of the range is a character
class, not a single character. The current implementation of
_Compiler::_M_expression_term does not provide a way to reject this,
because we only remember a previous character, not whether we just
processed a character class (or collating symbol etc.)
This patch replaces the pair<bool, CharT> used to emulate
optional<CharT> with a custom class closer to pair<tribool,CharT>. That
allows us to track three states, so that we can tell when we've just
seen a character class.
With this additional state the code in _M_expression_term for processing
the _S_token_bracket_dash can be improved to correctly reject the [\w-a]
case, without regressing for valid cases such as [\w-] and [----].
libstdc++-v3/ChangeLog:
PR libstdc++/102447
* include/bits/regex_compiler.h (_Compiler::_BracketState): New
class.
(_Compiler::_BrackeyMatcher): New alias template.
(_Compiler::_M_expression_term): Change pair<bool, CharT>
parameter to _BracketState. Process first character for
ECMAScript syntax as well as POSIX.
* include/bits/regex_compiler.tcc
(_Compiler::_M_insert_bracket_matcher): Pass _BracketState.
(_Compiler::_M_expression_term): Use _BracketState to store
state between calls. Improve handling of dashes in ranges.
* testsuite/28_regex/algorithms/regex_match/cstring_bracket_01.cc:
Add more tests for ranges containing dashes. Check invalid
ranges with character class at the beginning.
---
libstdc++-v3/include/bits/regex_compiler.h | 40 +++++-
libstdc++-v3/include/bits/regex_compiler.tcc | 118 ++++++++----------
.../regex_match/cstring_bracket_01.cc | 62 ++++++++-
3 files changed, 152 insertions(+), 68 deletions(-)
diff --git a/libstdc++-v3/include/bits/regex_compiler.h b/libstdc++-v3/include/bits/regex_compiler.h
index 88c60c2bed7..06cb48f2b6d 100644
--- a/libstdc++-v3/include/bits/regex_compiler.h
+++ b/libstdc++-v3/include/bits/regex_compiler.h
@@ -121,13 +121,45 @@ namespace __detail
void
_M_insert_bracket_matcher(bool __neg);
- // Returns true if successfully matched one term and should continue.
+ // Cache of the last atom seen in a bracketed range expression.
+ struct _BracketState
+ {
+ enum class _Type : char { _None, _Char, _Class } _M_type = _Type::_None;
+ _CharT _M_char;
+
+ void
+ set(_CharT __c) noexcept { _M_type = _Type::_Char; _M_char = __c; }
+
+ _GLIBCXX_NODISCARD _CharT
+ get() const noexcept { return _M_char; }
+
+ void
+ reset(_Type __t = _Type::_None) noexcept { _M_type = __t; }
+
+ explicit operator bool() const noexcept
+ { return _M_type != _Type::_None; }
+
+ // Previous token was a single character.
+ _GLIBCXX_NODISCARD bool
+ _M_is_char() const noexcept { return _M_type == _Type::_Char; }
+
+ // Previous token was a character class, equivalent class,
+ // collating symbol etc.
+ _GLIBCXX_NODISCARD bool
+ _M_is_class() const noexcept { return _M_type == _Type::_Class; }
+ };
+
+ template<bool __icase, bool __collate>
+ using _BracketMatcher
+ = std::__detail::_BracketMatcher<_TraitsT, __icase, __collate>;
+
+ // Returns true if successfully parsed one term and should continue
+ // compiling a bracket expression.
// Returns false if the compiler should move on.
template<bool __icase, bool __collate>
bool
- _M_expression_term(pair<bool, _CharT>& __last_char,
- _BracketMatcher<_TraitsT, __icase, __collate>&
- __matcher);
+ _M_expression_term(_BracketState& __last_char,
+ _BracketMatcher<__icase, __collate>& __matcher);
int
_M_cur_int_value(int __radix);
diff --git a/libstdc++-v3/include/bits/regex_compiler.tcc b/libstdc++-v3/include/bits/regex_compiler.tcc
index 0e2e1321376..9000aec8e25 100644
--- a/libstdc++-v3/include/bits/regex_compiler.tcc
+++ b/libstdc++-v3/include/bits/regex_compiler.tcc
@@ -403,7 +403,7 @@ namespace __detail
_M_insert_character_class_matcher()
{
__glibcxx_assert(_M_value.size() == 1);
- _BracketMatcher<_TraitsT, __icase, __collate> __matcher
+ _BracketMatcher<__icase, __collate> __matcher
(_M_ctype.is(_CtypeT::upper, _M_value[0]), _M_traits);
__matcher._M_add_character_class(_M_value, false);
__matcher._M_ready();
@@ -417,26 +417,17 @@ namespace __detail
_Compiler<_TraitsT>::
_M_insert_bracket_matcher(bool __neg)
{
- _BracketMatcher<_TraitsT, __icase, __collate> __matcher(__neg, _M_traits);
- pair<bool, _CharT> __last_char; // Optional<_CharT>
- __last_char.first = false;
- if (!(_M_flags & regex_constants::ECMAScript))
- {
- if (_M_try_char())
- {
- __last_char.first = true;
- __last_char.second = _M_value[0];
- }
- else if (_M_match_token(_ScannerT::_S_token_bracket_dash))
- {
- __last_char.first = true;
- __last_char.second = '-';
- }
- }
+ _BracketMatcher<__icase, __collate> __matcher(__neg, _M_traits);
+ _BracketState __last_char;
+ if (_M_try_char())
+ __last_char.set(_M_value[0]);
+ else if (_M_match_token(_ScannerT::_S_token_bracket_dash))
+ // Dash as first character is a normal character.
+ __last_char.set('-');
while (_M_expression_term(__last_char, __matcher))
;
- if (__last_char.first)
- __matcher._M_add_char(__last_char.second);
+ if (__last_char._M_is_char())
+ __matcher._M_add_char(__last_char.get());
__matcher._M_ready();
_M_stack.push(_StateSeqT(
*_M_nfa,
@@ -447,27 +438,27 @@ namespace __detail
template<bool __icase, bool __collate>
bool
_Compiler<_TraitsT>::
- _M_expression_term(pair<bool, _CharT>& __last_char,
- _BracketMatcher<_TraitsT, __icase, __collate>& __matcher)
+ _M_expression_term(_BracketState& __last_char,
+ _BracketMatcher<__icase, __collate>& __matcher)
{
if (_M_match_token(_ScannerT::_S_token_bracket_end))
return false;
+ // Add any previously cached char into the matcher and update cache.
const auto __push_char = [&](_CharT __ch)
{
- if (__last_char.first)
- __matcher._M_add_char(__last_char.second);
- else
- __last_char.first = true;
- __last_char.second = __ch;
+ if (__last_char._M_is_char())
+ __matcher._M_add_char(__last_char.get());
+ __last_char.set(__ch);
};
- const auto __flush = [&]
+ // Add any previously cached char into the matcher and update cache.
+ const auto __push_class = [&]
{
- if (__last_char.first)
- {
- __matcher._M_add_char(__last_char.second);
- __last_char.first = false;
- }
+ if (__last_char._M_is_char())
+ __matcher._M_add_char(__last_char.get());
+ // We don't cache anything here, just record that the last thing
+ // processed was a character class (or similar).
+ __last_char.reset(_BracketState::_Type::_Class);
};
if (_M_match_token(_ScannerT::_S_token_collsymbol))
@@ -476,16 +467,16 @@ namespace __detail
if (__symbol.size() == 1)
__push_char(__symbol[0]);
else
- __flush();
+ __push_class();
}
else if (_M_match_token(_ScannerT::_S_token_equiv_class_name))
{
- __flush();
+ __push_class();
__matcher._M_add_equivalence_class(_M_value);
}
else if (_M_match_token(_ScannerT::_S_token_char_class_name))
{
- __flush();
+ __push_class();
__matcher._M_add_character_class(_M_value, false);
}
else if (_M_try_char())
@@ -502,49 +493,50 @@ namespace __detail
// It turns out that no one reads BNFs ;)
else if (_M_match_token(_ScannerT::_S_token_bracket_dash))
{
- if (!__last_char.first)
+ if (_M_match_token(_ScannerT::_S_token_bracket_end))
{
- if (!(_M_flags & regex_constants::ECMAScript))
- {
- if (_M_match_token(_ScannerT::_S_token_bracket_end))
- {
- __push_char('-');
- return false;
- }
- __throw_regex_error(
- regex_constants::error_range,
- "Unexpected dash in bracket expression. For POSIX syntax, "
- "a dash is not treated literally only when it is at "
- "beginning or end.");
- }
+ // For "-]" the dash is a literal character.
__push_char('-');
+ return false;
}
- else
+ else if (__last_char._M_is_class())
+ {
+ // "\\w-" is invalid, start of range must be a single char.
+ __throw_regex_error(regex_constants::error_range,
+ "Invalid start of range in bracket expression.");
+ }
+ else if (__last_char._M_is_char())
{
if (_M_try_char())
{
- __matcher._M_make_range(__last_char.second, _M_value[0]);
- __last_char.first = false;
+ // "x-y"
+ __matcher._M_make_range(__last_char.get(), _M_value[0]);
+ __last_char.reset();
}
else if (_M_match_token(_ScannerT::_S_token_bracket_dash))
{
- __matcher._M_make_range(__last_char.second, '-');
- __last_char.first = false;
+ // "x--"
+ __matcher._M_make_range(__last_char.get(), '-');
+ __last_char.reset();
}
else
- {
- if (_M_scanner._M_get_token()
- != _ScannerT::_S_token_bracket_end)
- __throw_regex_error(
- regex_constants::error_range,
- "Character is expected after a dash.");
- __push_char('-');
- }
+ __throw_regex_error(regex_constants::error_range,
+ "Invalid end of range in bracket expression.");
}
+ else if (_M_flags & regex_constants::ECMAScript)
+ {
+ // A dash that is not part of an existing range. Might be the
+ // start of a new range, or might just be a literal '-' char.
+ // Only ECMAScript allows that in the middle of a bracket expr.
+ __push_char('-');
+ }
+ else
+ __throw_regex_error(regex_constants::error_range,
+ "Invalid dash in bracket expression.");
}
else if (_M_match_token(_ScannerT::_S_token_quoted_class))
{
- __flush();
+ __push_class();
__matcher._M_add_character_class(_M_value,
_M_ctype.is(_CtypeT::upper,
_M_value[0]));
diff --git a/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/cstring_bracket_01.cc b/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/cstring_bracket_01.cc
index 7df70604ea6..0d76e63da7b 100644
--- a/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/cstring_bracket_01.cc
+++ b/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/cstring_bracket_01.cc
@@ -69,6 +69,16 @@ test01()
void
test02()
{
+ VERIFY(regex_match("-", regex("[-]", regex_constants::ECMAScript)));
+ VERIFY(regex_match("-", regex("[--]", regex_constants::ECMAScript)));
+ VERIFY(regex_match("-", regex("[---]", regex_constants::ECMAScript)));
+ VERIFY(regex_match("-", regex("[----]", regex_constants::ECMAScript)));
+ VERIFY(regex_match("-", regex("[-----]", regex_constants::ECMAScript)));
+
+ VERIFY(regex_match("-", regex("[-]", regex_constants::extended)));
+ VERIFY(regex_match("-", regex("[--]", regex_constants::extended)));
+ VERIFY(regex_match("-", regex("[---]", regex_constants::extended)));
+ VERIFY(regex_match("-", regex("[----]", regex_constants::extended)));
try
{
std::regex re("[-----]", std::regex::extended);
@@ -78,7 +88,6 @@ test02()
{
VERIFY(e.code() == std::regex_constants::error_range);
}
- std::regex re("[-----]", std::regex::ECMAScript);
VERIFY(!regex_match("b", regex("[-ac]", regex_constants::extended)));
VERIFY(!regex_match("b", regex("[ac-]", regex_constants::extended)));
@@ -93,7 +102,27 @@ test02()
}
catch (const std::regex_error& e)
{
+ VERIFY(e.code() == std::regex_constants::error_range);
}
+ try
+ {
+ regex("[@--]", regex_constants::extended);
+ VERIFY(false);
+ }
+ catch (const std::regex_error& e)
+ {
+ VERIFY(e.code() == std::regex_constants::error_range);
+ }
+ try
+ {
+ regex("[--%]", regex_constants::extended);
+ VERIFY(false);
+ }
+ catch (const std::regex_error& e)
+ {
+ VERIFY(e.code() == std::regex_constants::error_range);
+ }
+
VERIFY(regex_match("].", regex("[][.hyphen.]-0]*", regex_constants::extended)));
}
@@ -158,6 +187,36 @@ test06()
VERIFY(regex_match("a-", debian_cron_namespace_ok));
}
+// libstdc++/102447
+void
+test07()
+{
+ VERIFY(regex_match("-", std::regex("[\\w-]", std::regex::ECMAScript)));
+ VERIFY(regex_match("a", std::regex("[\\w-]", std::regex::ECMAScript)));
+ VERIFY(regex_match("-", std::regex("[a-]", std::regex::ECMAScript)));
+ VERIFY(regex_match("a", std::regex("[a-]", std::regex::ECMAScript)));
+
+ try
+ {
+ std::regex re("[\\w-a]", std::regex::ECMAScript);
+ VERIFY(false);
+ }
+ catch (const std::regex_error& e)
+ {
+ VERIFY(e.code() == std::regex_constants::error_range);
+ }
+
+ try
+ {
+ std::regex re("[\\w--]", std::regex::ECMAScript);
+ VERIFY(false);
+ }
+ catch (const std::regex_error& e)
+ {
+ VERIFY(e.code() == std::regex_constants::error_range);
+ }
+}
+
int
main()
{
@@ -167,6 +226,7 @@ main()
test04();
test05();
test06();
+ test07();
return 0;
}
--
2.31.1
reply other threads:[~2021-12-14 21:47 UTC|newest]
Thread overview: [no followups] expand[flat|nested] mbox.gz Atom feed
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20211214214753.2133124-1-jwakely@redhat.com \
--to=jwakely@redhat.com \
--cc=gcc-patches@gcc.gnu.org \
--cc=libstdc++@gcc.gnu.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).