From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 2181) id 1475738582A1; Fri, 23 Jun 2023 16:12:27 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 1475738582A1 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org; s=default; t=1687536747; bh=gPC1hXYrTSXvaj8rzNGxaJ+a22tQ+OIhkwrC/DD0ViI=; h=From:To:Subject:Date:From; b=Lnw8jRCgjWqENZVNYUR6bTNscUYyL72MmrhpfU9epx54G+w8hFEzANLddoHAjwIzO 4AxOm5Ex4CJ8yjrvMFubeQgjOScbbr1x0tdBXBDN6NK7CAfY1dFNxYnBO6dtc7K/Lr 74FTT/GkREvT8NSaq8AdyWcJiu1s2tlzqFGM748g= MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Content-Type: text/plain; charset="utf-8" From: Jonathan Wakely To: gcc-cvs@gcc.gnu.org, libstdc++-cvs@gcc.gnu.org Subject: [gcc r10-11461] libstdc++: std::basic_regex should treat '\0' as an ordinary char [PR84110] X-Act-Checkin: gcc X-Git-Author: Jonathan Wakely X-Git-Refname: refs/heads/releases/gcc-10 X-Git-Oldrev: 61dc491950863d0b441ddd3f9d609b83a01deaf7 X-Git-Newrev: d877bf3bdf46b5c996505fc247d170e79fbfa4bf Message-Id: <20230623161227.1475738582A1@sourceware.org> Date: Fri, 23 Jun 2023 16:12:27 +0000 (GMT) List-Id: https://gcc.gnu.org/g:d877bf3bdf46b5c996505fc247d170e79fbfa4bf commit r10-11461-gd877bf3bdf46b5c996505fc247d170e79fbfa4bf Author: Jonathan Wakely Date: Wed Sep 29 13:48:11 2021 +0100 libstdc++: std::basic_regex should treat '\0' as an ordinary char [PR84110] When the input sequence contains a _CharT(0) character, the strchr call in _Scanner<_CharT>::_M_scan_normal() will search for '\0' and so return a pointer to the terminating null at the end of the string. This makes the scanner think it's found a special character. Because it doesn't match any of the actual special characters, we fall off the end of the function (or assert in debug mode). We should check for a null character explicitly and either treat it as an ordinary character (for the ECMAScript grammar) or an error (for all others). 
I'm not 100% sure that's right, but it seems consistent with the POSIX RE rules where a '\0' means the end of the regex pattern or the end of the sequence being matched. Signed-off-by: Jonathan Wakely libstdc++-v3/ChangeLog: PR libstdc++/84110 * include/bits/regex_error.h (regex_constants::_S_null): New error code for internal use. * include/bits/regex_scanner.tcc (_Scanner::_M_scan_normal()): Check for null character. * testsuite/28_regex/basic_regex/84110.cc: New test. (cherry picked from commit b701e1f8f6870c0f8cb4050674da489101dd05a5) Diff: --- libstdc++-v3/include/bits/regex_error.h | 1 + libstdc++-v3/include/bits/regex_scanner.tcc | 10 ++++++ .../testsuite/28_regex/basic_regex/84110.cc | 39 ++++++++++++++++++++++ 3 files changed, 50 insertions(+) diff --git a/libstdc++-v3/include/bits/regex_error.h b/libstdc++-v3/include/bits/regex_error.h index b40351a39cc..f2899174352 100644 --- a/libstdc++-v3/include/bits/regex_error.h +++ b/libstdc++-v3/include/bits/regex_error.h @@ -61,6 +61,7 @@ namespace regex_constants _S_error_badrepeat, _S_error_complexity, _S_error_stack, + _S_null }; /** The expression contained an invalid collating element name. 
*/ diff --git a/libstdc++-v3/include/bits/regex_scanner.tcc b/libstdc++-v3/include/bits/regex_scanner.tcc index cb8a526ea1a..a5a16af6cec 100644 --- a/libstdc++-v3/include/bits/regex_scanner.tcc +++ b/libstdc++-v3/include/bits/regex_scanner.tcc @@ -176,6 +176,16 @@ namespace __detail _M_state = _S_state_in_brace; _M_token = _S_token_interval_begin; } + else if (__builtin_expect(__c == _CharT(0), false)) + { + if (!_M_is_ecma()) + { + __throw_regex_error(regex_constants::_S_null, + "Unexpected null character in regular expression"); + } + _M_token = _S_token_ord_char; + _M_value.assign(1, __c); + } else if (__c != ']' && __c != '}') { auto __it = _M_token_tbl; diff --git a/libstdc++-v3/testsuite/28_regex/basic_regex/84110.cc b/libstdc++-v3/testsuite/28_regex/basic_regex/84110.cc new file mode 100644 index 00000000000..b9971dcaac5 --- /dev/null +++ b/libstdc++-v3/testsuite/28_regex/basic_regex/84110.cc @@ -0,0 +1,39 @@ +// { dg-do run { target c++11 } } +#include <regex> +#include <string> +#include <testsuite_hooks.h> + +void test01() +{ + const std::string s(1ul, '\0'); + std::regex re(s); + VERIFY( std::regex_match(s, re) ); // PR libstdc++/84110 + +#if __cpp_exceptions + using namespace std::regex_constants; + for (auto syn : {basic, extended, awk, grep, egrep}) + { + try + { + std::regex{s, syn}; // '\0' is not valid for other grammars + VERIFY( false ); + } + catch (const std::regex_error&) + { + } + } +#endif +} + +void test02() +{ + const std::string s("uh-\0h", 5); + std::regex re(s); + VERIFY( std::regex_match(s, re) ); +} + +int main() +{ + test01(); + test02(); +}