* [Patch 2/2] Localization problem in regex @ 2013-08-23 9:25 Tim Shen 2013-08-23 10:15 ` Jonathan Wakely ` (2 more replies) 0 siblings, 3 replies; 13+ messages in thread From: Tim Shen @ 2013-08-23 9:25 UTC (permalink / raw) To: libstdc++, gcc-patches [-- Attachment #1: Type: text/plain, Size: 101 bytes --] Inspired by this mail: http://gcc.gnu.org/ml/libstdc++/2013-08/msg00131.html Thanks! -- Tim Shen [-- Attachment #2: regex-locale.patch --] [-- Type: application/octet-stream, Size: 5533 bytes --] commit 62121ad0480af3d9abe0b0ece9bc636dabd2b6c2 Author: tim <timshen91@gmail.com> Date: Thu Aug 22 17:28:51 2013 +0800 2013-08-23 Tim Shen <timshen91@gmail.com> * include/bits/regex.h: Fix callers. * include/bits/regex_compiler.h: Store _Traits reference. * include/bits/regex_executor.h: Add _Traits member for _DFSExecutor. * include/bits/regex_executor.tcc: Likewise. * testsuite/28_regex/algorithms/regex_match/extended/wstring_locale.cc: New. diff --git a/libstdc++-v3/include/bits/regex.h b/libstdc++-v3/include/bits/regex.h index 4838819..19d9bbd 100644 --- a/libstdc++-v3/include/bits/regex.h +++ b/libstdc++-v3/include/bits/regex.h @@ -880,8 +880,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION assign(const basic_string<_Ch_type, _Ch_typeraits, _Alloc>& __s, flag_type __flags = ECMAScript) { - basic_regex __tmp(__s, __flags); - this->swap(__tmp); + _M_automaton = + __detail::_Compiler<decltype(__s.begin()), _Ch_type, _Rx_traits> + (__s.begin(), __s.end(), _M_traits, _M_flags)._M_get_nfa(); return *this; } diff --git a/libstdc++-v3/include/bits/regex_compiler.h b/libstdc++-v3/include/bits/regex_compiler.h index 1d588b9..a1107bb 100644 --- a/libstdc++-v3/include/bits/regex_compiler.h +++ b/libstdc++-v3/include/bits/regex_compiler.h @@ -214,7 +214,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION return _M_traits.transform(__s.begin(), __s.end()); } - _TraitsT _M_traits; + const _TraitsT& _M_traits; _FlagT _M_flags; bool _M_is_non_matching; std::vector<_CharT> _M_char_set; diff --git a/libstdc++-v3/include/bits/regex_executor.h b/libstdc++-v3/include/bits/regex_executor.h index 23998ed..6d66d88 100644 --- a/libstdc++-v3/include/bits/regex_executor.h +++ b/libstdc++-v3/include/bits/regex_executor.h @@ -120,13 +120,14 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION typedef typename _BaseT::_ResultsVec _ResultsVec; typedef regex_constants::match_flag_type _FlagT; - _DFSExecutor(_BiIter __begin, - _BiIter __end, - _ResultsT& __results, - const _RegexT& __nfa, - _FlagT __flags) + _DFSExecutor(_BiIter __begin, + _BiIter __end, + _ResultsT& __results, + const _RegexT& __nfa, + const _TraitsT& __traits, + _FlagT __flags) : _BaseT(__begin, __end, __results, __flags, __nfa._M_sub_count()), - _M_traits(_TraitsT()), _M_nfa(__nfa), _M_results_ret(this->_M_results) + _M_traits(__traits), _M_nfa(__nfa), _M_results_ret(this->_M_results) { } void @@ -142,9 +143,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION bool _M_dfs(_StateIdT __i); - _ResultsVec _M_results_ret; - _TraitsT _M_traits; - const _RegexT& _M_nfa; + _ResultsVec _M_results_ret; + const _TraitsT& _M_traits; + const _RegexT& _M_nfa; }; // Like the DFS approach, it try every possible state transition; Unlike DFS, diff --git a/libstdc++-v3/include/bits/regex_executor.tcc b/libstdc++-v3/include/bits/regex_executor.tcc index edfd0b6..788d65e 100644 --- a/libstdc++-v3/include/bits/regex_executor.tcc +++ b/libstdc++-v3/include/bits/regex_executor.tcc @@ -320,7 +320,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION auto __p = std::static_pointer_cast<_NFA<_CharT, _TraitsT>> (__re._M_automaton); if (__p->_M_has_backref) - return _ExecutorPtr(new _DFSExecutorT(__b, __e, __m, *__p, __flags)); + return _ExecutorPtr(new _DFSExecutorT(__b, __e, __m, *__p, + __re._M_traits, __flags)); return _ExecutorPtr(new _BFSExecutorT(__b, __e, __m, *__p, __flags)); } diff --git a/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/extended/wstring_locale.cc b/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/extended/wstring_locale.cc new file mode 100644 index 0000000..a7225cb --- /dev/null +++ b/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/extended/wstring_locale.cc @@ -0,0 +1,48 @@ +// { dg-options "-std=gnu++11" } +// { dg-require-namedlocale "de_DE.UTF-8" } + +// +// 2013-08-23 Tim Shen <timshen91@gmail.com> +// +// Copyright (C) 2013 Free Software Foundation, Inc. +// +// This file is part of the GNU ISO C++ Library. This library is free +// software; you can redistribute it and/or modify it under the +// terms of the GNU General Public License as published by the +// Free Software Foundation; either version 3, or (at your option) +// any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License along +// with this library; see the file COPYING3. If not see +// <http://www.gnu.org/licenses/>. + +// 28.11.2 regex_match +// Tests Extended localization against a wide-string. + +#include <regex> +#include <testsuite_hooks.h> + +void +test01() +{ + bool test __attribute__((unused)) = true; + + std::wstring str2 = L"ÜBER"; + std::wregex re2; + re2.imbue(std::locale("de_DE.UTF-8")); + re2.assign(L"[[:upper:]]*", std::regex::extended); + std::wsmatch m2; + VERIFY(std::regex_match(str2, m2, re2)); +} + +int +main() +{ + test01(); + return 0; +} ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [Patch 2/2] Localization problem in regex 2013-08-23 9:25 [Patch 2/2] Localization problem in regex Tim Shen @ 2013-08-23 10:15 ` Jonathan Wakely 2013-08-23 11:08 ` Paolo Carlini 2013-08-23 19:04 ` Stefan Schweter 2 siblings, 0 replies; 13+ messages in thread From: Jonathan Wakely @ 2013-08-23 10:15 UTC (permalink / raw) To: Tim Shen; +Cc: libstdc++, gcc-patches On 23 August 2013 10:17, Tim Shen wrote: > Inspired by this mail: http://gcc.gnu.org/ml/libstdc++/2013-08/msg00131.html This is OK too, thanks for the quick fix to the locale issue! ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [Patch 2/2] Localization problem in regex 2013-08-23 9:25 [Patch 2/2] Localization problem in regex Tim Shen 2013-08-23 10:15 ` Jonathan Wakely @ 2013-08-23 11:08 ` Paolo Carlini 2013-08-23 14:04 ` Tim Shen 2013-08-23 19:04 ` Stefan Schweter 2 siblings, 1 reply; 13+ messages in thread From: Paolo Carlini @ 2013-08-23 11:08 UTC (permalink / raw) To: Tim Shen; +Cc: libstdc++, gcc-patches Hi, On 08/23/2013 11:17 AM, Tim Shen wrote: > 2013-08-23 Tim Shen<timshen91@gmail.com> > > * include/bits/regex.h: Fix callers. > * include/bits/regex_compiler.h: Store _Traits reference. > * include/bits/regex_executor.h: Add _Traits member for _DFSExecutor. > * include/bits/regex_executor.tcc: Likewise. > * testsuite/28_regex/algorithms/regex_match/extended/wstring_locale.cc: > New. Thanks a lot indeed. But: "Fix callers" of *what*?!? And from *where*? To be really honest this kind of ChangeLog entry doesn't make much sense. Please get used to list the specific functions you are changing. It's important. Thanks, Paolo. ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [Patch 2/2] Localization problem in regex 2013-08-23 11:08 ` Paolo Carlini @ 2013-08-23 14:04 ` Tim Shen 2013-08-23 14:27 ` Paolo Carlini 0 siblings, 1 reply; 13+ messages in thread From: Tim Shen @ 2013-08-23 14:04 UTC (permalink / raw) To: Paolo Carlini; +Cc: libstdc++, gcc-patches [-- Attachment #1: Type: text/plain, Size: 391 bytes --] On Fri, Aug 23, 2013 at 6:29 PM, Paolo Carlini <paolo.carlini@oracle.com> wrote: > Thanks a lot indeed. But: "Fix callers" of *what*?!? And from *where*? To be > really honest this kind of ChangeLog entry doesn't make much sense. Please > get used to list the specific functions you are changing. It's important. I see. Actually I find a mistake by this review /w\, thanks! -- Tim Shen [-- Attachment #2: regex-locale.patch --] [-- Type: application/octet-stream, Size: 5626 bytes --] commit 962a57c32c1928c021ec0eb837b9a19f8a7fd3f3 Author: tim <timshen91@gmail.com> Date: Thu Aug 22 17:28:51 2013 +0800 2013-08-23 Tim Shen <timshen91@gmail.com> * include/bits/regex.h: Let basic_regex::assign not lose _M_traits. * include/bits/regex_compiler.h: Store _Compiler::_M_traits by reference instead of by value. * include/bits/regex_executor.h: Add _M_traits to _DFSExecutor. * include/bits/regex_executor.tcc: Likewise. * testsuite/28_regex/algorithms/regex_match/extended/wstring_locale.cc: New. diff --git a/libstdc++-v3/include/bits/regex.h b/libstdc++-v3/include/bits/regex.h index 4838819..40db517 100644 --- a/libstdc++-v3/include/bits/regex.h +++ b/libstdc++-v3/include/bits/regex.h @@ -880,8 +880,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION assign(const basic_string<_Ch_type, _Ch_typeraits, _Alloc>& __s, flag_type __flags = ECMAScript) { - basic_regex __tmp(__s, __flags); - this->swap(__tmp); + _M_flags = __flags; + _M_automaton = + __detail::_Compiler<decltype(__s.begin()), _Ch_type, _Rx_traits> + (__s.begin(), __s.end(), _M_traits, _M_flags)._M_get_nfa(); return *this; } diff --git a/libstdc++-v3/include/bits/regex_compiler.h b/libstdc++-v3/include/bits/regex_compiler.h index 1d588b9..a1107bb 100644 --- a/libstdc++-v3/include/bits/regex_compiler.h +++ b/libstdc++-v3/include/bits/regex_compiler.h @@ -214,7 +214,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION return _M_traits.transform(__s.begin(), __s.end()); } - _TraitsT _M_traits; + const _TraitsT& _M_traits; _FlagT _M_flags; bool _M_is_non_matching; std::vector<_CharT> _M_char_set; diff --git a/libstdc++-v3/include/bits/regex_executor.h b/libstdc++-v3/include/bits/regex_executor.h index 23998ed..6d66d88 100644 --- a/libstdc++-v3/include/bits/regex_executor.h +++ b/libstdc++-v3/include/bits/regex_executor.h @@ -120,13 +120,14 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION typedef typename _BaseT::_ResultsVec _ResultsVec; typedef regex_constants::match_flag_type _FlagT; - _DFSExecutor(_BiIter __begin, - _BiIter __end, - _ResultsT& __results, - const _RegexT& __nfa, - _FlagT __flags) + _DFSExecutor(_BiIter __begin, + _BiIter __end, + _ResultsT& __results, + const _RegexT& __nfa, + const _TraitsT& __traits, + _FlagT __flags) : _BaseT(__begin, __end, __results, __flags, __nfa._M_sub_count()), - _M_traits(_TraitsT()), _M_nfa(__nfa), _M_results_ret(this->_M_results) + _M_traits(__traits), _M_nfa(__nfa), _M_results_ret(this->_M_results) { } void @@ -142,9 +143,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION bool _M_dfs(_StateIdT __i); - _ResultsVec _M_results_ret; - _TraitsT _M_traits; - const _RegexT& _M_nfa; + _ResultsVec _M_results_ret; + const _TraitsT& _M_traits; + const _RegexT& _M_nfa; }; // Like the DFS approach, it try every possible state transition; Unlike DFS, diff --git a/libstdc++-v3/include/bits/regex_executor.tcc b/libstdc++-v3/include/bits/regex_executor.tcc index edfd0b6..788d65e 100644 --- a/libstdc++-v3/include/bits/regex_executor.tcc +++ b/libstdc++-v3/include/bits/regex_executor.tcc @@ -320,7 +320,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION auto __p = std::static_pointer_cast<_NFA<_CharT, _TraitsT>> (__re._M_automaton); if (__p->_M_has_backref) - return _ExecutorPtr(new _DFSExecutorT(__b, __e, __m, *__p, __flags)); + return _ExecutorPtr(new _DFSExecutorT(__b, __e, __m, *__p, + __re._M_traits, __flags)); return _ExecutorPtr(new _BFSExecutorT(__b, __e, __m, *__p, __flags)); } diff --git a/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/extended/wstring_locale.cc b/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/extended/wstring_locale.cc new file mode 100644 index 0000000..a7225cb --- /dev/null +++ b/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/extended/wstring_locale.cc @@ -0,0 +1,48 @@ +// { dg-options "-std=gnu++11" } +// { dg-require-namedlocale "de_DE.UTF-8" } + +// +// 2013-08-23 Tim Shen <timshen91@gmail.com> +// +// Copyright (C) 2013 Free Software Foundation, Inc. +// +// This file is part of the GNU ISO C++ Library. This library is free +// software; you can redistribute it and/or modify it under the +// terms of the GNU General Public License as published by the +// Free Software Foundation; either version 3, or (at your option) +// any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License along +// with this library; see the file COPYING3. If not see +// <http://www.gnu.org/licenses/>. + +// 28.11.2 regex_match +// Tests Extended localization against a wide-string. + +#include <regex> +#include <testsuite_hooks.h> + +void +test01() +{ + bool test __attribute__((unused)) = true; + + std::wstring str2 = L"ÜBER"; + std::wregex re2; + re2.imbue(std::locale("de_DE.UTF-8")); + re2.assign(L"[[:upper:]]*", std::regex::extended); + std::wsmatch m2; + VERIFY(std::regex_match(str2, m2, re2)); +} + +int +main() +{ + test01(); + return 0; +} ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [Patch 2/2] Localization problem in regex 2013-08-23 14:04 ` Tim Shen @ 2013-08-23 14:27 ` Paolo Carlini 2013-08-23 14:57 ` Tim Shen 0 siblings, 1 reply; 13+ messages in thread From: Paolo Carlini @ 2013-08-23 14:27 UTC (permalink / raw) To: Tim Shen; +Cc: libstdc++, gcc-patches On 08/23/2013 03:59 PM, Tim Shen wrote: > On Fri, Aug 23, 2013 at 6:29 PM, Paolo Carlini <paolo.carlini@oracle.com> wrote: >> Thanks a lot indeed. But: "Fix callers" of *what*?!? And from *where*? To be >> really honest this kind of ChangeLog entry doesn't make much sense. Please >> get used to list the specific functions you are changing. It's important. > I see. > > Actually I find a mistake by this review /w\, thanks! Great. But please but the names of the functions you are changing between round brackets. You have tons of examples everywhere. Paolo. ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [Patch 2/2] Localization problem in regex 2013-08-23 14:27 ` Paolo Carlini @ 2013-08-23 14:57 ` Tim Shen 2013-08-23 15:02 ` Jonathan Wakely 2013-08-23 15:05 ` Paolo Carlini 0 siblings, 2 replies; 13+ messages in thread From: Tim Shen @ 2013-08-23 14:57 UTC (permalink / raw) To: Paolo Carlini; +Cc: libstdc++, gcc-patches [-- Attachment #1: Type: text/plain, Size: 244 bytes --] On Fri, Aug 23, 2013 at 10:26 PM, Paolo Carlini <paolo.carlini@oracle.com> wrote: > Great. But please but the names of the functions you are changing between > round brackets. You have tons of examples everywhere. ...like this? -- Tim Shen [-- Attachment #2: changelog --] [-- Type: application/octet-stream, Size: 406 bytes --] 2013-08-23 Tim Shen <timshen91@gmail.com> * include/bits/regex.h (basic_regex<>::assign): Don't lose _M_traits. * include/bits/regex_compiler.h: Store _Compiler::_M_traits by reference instead of by value. * include/bits/regex_executor.h: Add _M_traits to _DFSExecutor. * include/bits/regex_executor.tcc: Likewise. * testsuite/28_regex/algorithms/regex_match/extended/wstring_locale.cc: New. ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [Patch 2/2] Localization problem in regex 2013-08-23 14:57 ` Tim Shen @ 2013-08-23 15:02 ` Jonathan Wakely 2013-08-23 15:05 ` Paolo Carlini 1 sibling, 0 replies; 13+ messages in thread From: Jonathan Wakely @ 2013-08-23 15:02 UTC (permalink / raw) To: Tim Shen; +Cc: Paolo Carlini, libstdc++, gcc-patches On 23 August 2013 15:53, Tim Shen wrote: > On Fri, Aug 23, 2013 at 10:26 PM, Paolo Carlini > <paolo.carlini@oracle.com> wrote: >> Great. But please but the names of the functions you are changing between >> round brackets. You have tons of examples everywhere. > > ...like this? Yes, that makes it clear what part of the file is affected. That's important when searching ChangeLogs to find what change last touched e.g. basic_regex::assign. ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [Patch 2/2] Localization problem in regex 2013-08-23 14:57 ` Tim Shen 2013-08-23 15:02 ` Jonathan Wakely @ 2013-08-23 15:05 ` Paolo Carlini 2013-08-23 15:48 ` Tim Shen 1 sibling, 1 reply; 13+ messages in thread From: Paolo Carlini @ 2013-08-23 15:05 UTC (permalink / raw) To: Tim Shen; +Cc: libstdc++, gcc-patches On 08/23/2013 04:53 PM, Tim Shen wrote: > On Fri, Aug 23, 2013 at 10:26 PM, Paolo Carlini > <paolo.carlini@oracle.com> wrote: >> Great. But please but the names of the functions you are changing between >> round brackets. You have tons of examples everywhere. > ...like this? The assign change is fine. But I would also put between round brackets (class _Compiler<>) for the _M_traits change and use something like (_DFSExecutor<>::_DFSExecutor): Adjust. for the last change (remember that constructors are special member *functions* thus should be normally listed). Paolo. ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [Patch 2/2] Localization problem in regex 2013-08-23 15:05 ` Paolo Carlini @ 2013-08-23 15:48 ` Tim Shen 2013-08-23 16:20 ` Jonathan Wakely 0 siblings, 1 reply; 13+ messages in thread From: Tim Shen @ 2013-08-23 15:48 UTC (permalink / raw) To: Paolo Carlini, Jonathan Wakely; +Cc: libstdc++, gcc-patches [-- Attachment #1: Type: text/plain, Size: 879 bytes --] On Fri, Aug 23, 2013 at 11:03 PM, Paolo Carlini <paolo.carlini@oracle.com> wrote: > The assign change is fine. But I would also put between round brackets > (class _Compiler<>) for the _M_traits change and use something like > (_DFSExecutor<>::_DFSExecutor): Adjust. for the last change (remember that > constructors are special member *functions* thus should be normally listed). I see. On Fri, Aug 23, 2013 at 10:57 PM, Jonathan Wakely <jwakely.gcc@gmail.com> wrote: > Yes, that makes it clear what part of the file is affected. That's > important when searching ChangeLogs to find what change last touched > e.g. basic_regex::assign. Why don't we `svn blame`? By the way, do we seariously need to keep ChangeLog in files? Is it more like a tradition than a solution? After all, we have SVN or Git now. I remember a mail metioned this but cannot find it >.< -- Tim Shen [-- Attachment #2: changelog --] [-- Type: application/octet-stream, Size: 488 bytes --] 2013-08-23 Tim Shen <timshen91@gmail.com> * include/bits/regex.h (basic_regex<>::assign): Don't lose _M_traits. * include/bits/regex_compiler.h: Store _Compiler::_M_traits by reference instead of by value. * include/bits/regex_executor.h (_DFSExecutor<>::_DFSExecutor): Add _M_traits to _DFSExecutor. * include/bits/regex_executor.tcc (__get_executor<>): Pass _M_traits to _DFSExecutor too. * testsuite/28_regex/algorithms/regex_match/extended/wstring_locale.cc: New. ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [Patch 2/2] Localization problem in regex 2013-08-23 15:48 ` Tim Shen @ 2013-08-23 16:20 ` Jonathan Wakely 0 siblings, 0 replies; 13+ messages in thread From: Jonathan Wakely @ 2013-08-23 16:20 UTC (permalink / raw) To: Tim Shen; +Cc: Paolo Carlini, libstdc++, gcc-patches On 23 August 2013 16:48, Tim Shen wrote: > By the way, do we seariously need to keep ChangeLog in files? Is it > more like a tradition than a solution? After all, we have SVN or Git > now. I remember a mail metioned this but cannot find it >.< It's not a tradition, they are required, see http://www.gnu.org/prep/standards/standards.html#Change-Logs ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [Patch 2/2] Localization problem in regex 2013-08-23 9:25 [Patch 2/2] Localization problem in regex Tim Shen 2013-08-23 10:15 ` Jonathan Wakely 2013-08-23 11:08 ` Paolo Carlini @ 2013-08-23 19:04 ` Stefan Schweter 2013-08-24 6:15 ` Tim Shen 2 siblings, 1 reply; 13+ messages in thread From: Stefan Schweter @ 2013-08-23 19:04 UTC (permalink / raw) To: Tim Shen; +Cc: libstdc++, gcc-patches Am Fri, 23 Aug 2013 17:17:41 +0800 schrieb Tim Shen <timshen91@gmail.com>: > Inspired by this mail: > http://gcc.gnu.org/ml/libstdc++/2013-08/msg00131.html > > Thanks! > > Hi Tim, I applied the patch + compiled everything - it's working now! Thanks! But... I found a problem with .imbue(): int main() { std::setlocale(LC_ALL, ""); std::wstring str2 = L"öäü"; std::wregex re2; re2.imbue(std::locale("")); re2.assign(L"([[:lower:]]+)"); std::wsmatch m2; std::wsregex_token_iterator end {}; for (std::wsregex_token_iterator p{str2.begin(), str2.end(), re2, {1}}; p != end; ++p) { std::wcout << *p << std::endl; } return 0; } Works fine! But when there's no imbue() call, a Segmentation fault occurs. int main() { std::setlocale(LC_ALL, ""); std::wstring str2 = L"öäü"; std::wregex re2; //re2.imbue(std::locale("")); re2.assign(L"([[:lower:]]+)"); std::wsmatch m2; std::wsregex_token_iterator end {}; for (std::wsregex_token_iterator p{str2.begin(), str2.end(), re2, {1}}; p != end; ++p) { std::wcout << *p << std::endl; } return 0; } May be it's better to throw an exception here? Thanks in advance, Stefan ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [Patch 2/2] Localization problem in regex 2013-08-23 19:04 ` Stefan Schweter @ 2013-08-24 6:15 ` Tim Shen 2013-08-29 18:45 ` Tim Shen 0 siblings, 1 reply; 13+ messages in thread From: Tim Shen @ 2013-08-24 6:15 UTC (permalink / raw) To: Stefan Schweter; +Cc: libstdc++, gcc-patches [-- Attachment #1: Type: text/plain, Size: 1661 bytes --] On Sat, Aug 24, 2013 at 1:51 AM, Stefan Schweter <stefan@schweter.it> wrote: > Am Fri, 23 Aug 2013 17:17:41 +0800 > schrieb Tim Shen <timshen91@gmail.com>: > >> Inspired by this mail: >> http://gcc.gnu.org/ml/libstdc++/2013-08/msg00131.html >> >> Thanks! >> >> > > Hi Tim, > > I applied the patch + compiled everything - it's working now! Thanks! > > But... I found a problem with .imbue(): > > int main() > { > std::setlocale(LC_ALL, ""); > > std::wstring str2 = L"öäü"; > std::wregex re2; > re2.imbue(std::locale("")); > re2.assign(L"([[:lower:]]+)"); > std::wsmatch m2; > > std::wsregex_token_iterator end {}; > > for (std::wsregex_token_iterator p{str2.begin(), str2.end(), > re2, {1}}; p != end; ++p) { std::wcout << *p << std::endl; > } > > return 0; > } > > Works fine! But when there's no imbue() call, a Segmentation > fault occurs. > > int main() > { > std::setlocale(LC_ALL, ""); > > std::wstring str2 = L"öäü"; > std::wregex re2; > //re2.imbue(std::locale("")); > re2.assign(L"([[:lower:]]+)"); > std::wsmatch m2; > > std::wsregex_token_iterator end {}; > > for (std::wsregex_token_iterator p{str2.begin(), str2.end(), > re2, {1}}; p != end; ++p) { std::wcout << *p << std::endl; > } > > return 0; > } > > May be it's better to throw an exception here? Thanks in advance, > > Stefan Fixed. It's not fully tested. I just run the 28_regex testcases. I'll do a full test before committing. Thanks again! -- Tim Shen [-- Attachment #2: regex-locale.patch --] [-- Type: application/octet-stream, Size: 8638 bytes --] commit 7941b4ae05f205ebd65ba1f8420a8eb9120c5bce Author: tim <timshen91@gmail.com> Date: Thu Aug 22 17:28:51 2013 +0800 2013-08-23 Tim Shen <timshen91@gmail.com> * include/bits/regex.h (basic_regex<>::assign): Don't lose _M_traits. (regex_iterator<>::regex_iterator): Return nullptr when regex_search failed. (regex_token_iterator<>::_M_end_of_seq): Should be defined true when _M_result is(not isn't) nullptr. * include/bits/regex_compiler.h: Store _Compiler::_M_traits by reference instead of by value. * include/bits/regex_executor.h (_DFSExecutor<>::_DFSExecutor): Add _M_traits to _DFSExecutor. * include/bits/regex_executor.tcc (__get_executor<>): Pass _M_traits to _DFSExecutor too. * testsuite/28_regex/algorithms/regex_match/extended/wstring_locale.cc: New. * testsuite/28_regex/iterators/regex_token_iterator/wchar_t/ wstring_02.cc: New. diff --git a/libstdc++-v3/include/bits/regex.h b/libstdc++-v3/include/bits/regex.h index 4838819..412465a 100644 --- a/libstdc++-v3/include/bits/regex.h +++ b/libstdc++-v3/include/bits/regex.h @@ -880,8 +880,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION assign(const basic_string<_Ch_type, _Ch_typeraits, _Alloc>& __s, flag_type __flags = ECMAScript) { - basic_regex __tmp(__s, __flags); - this->swap(__tmp); + _M_flags = __flags; + _M_automaton = + __detail::_Compiler<decltype(__s.begin()), _Ch_type, _Rx_traits> + (__s.begin(), __s.end(), _M_traits, _M_flags)._M_get_nfa(); return *this; } @@ -2591,7 +2593,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION regex_constants::match_flag_type __m = regex_constants::match_default) : _M_begin(__a), _M_end(__b), _M_pregex(&__re), _M_flags(__m), _M_match() - { regex_search(_M_begin, _M_end, _M_match, *_M_pregex, _M_flags); } + { + if (!regex_search(_M_begin, _M_end, _M_match, *_M_pregex, _M_flags)) + *this = regex_iterator(); + } /** * Copy constructs a %regex_iterator. @@ -2905,9 +2910,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION return (*_M_position)[_M_subs[_M_n]]; } - bool - _M_end_of_seq() const - { return _M_result != nullptr; } + constexpr bool + _M_end_of_seq() + { return _M_result == nullptr; } _Position _M_position; const value_type* _M_result; diff --git a/libstdc++-v3/include/bits/regex_compiler.h b/libstdc++-v3/include/bits/regex_compiler.h index 1d588b9..a1107bb 100644 --- a/libstdc++-v3/include/bits/regex_compiler.h +++ b/libstdc++-v3/include/bits/regex_compiler.h @@ -214,7 +214,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION return _M_traits.transform(__s.begin(), __s.end()); } - _TraitsT _M_traits; + const _TraitsT& _M_traits; _FlagT _M_flags; bool _M_is_non_matching; std::vector<_CharT> _M_char_set; diff --git a/libstdc++-v3/include/bits/regex_executor.h b/libstdc++-v3/include/bits/regex_executor.h index 23998ed..6d66d88 100644 --- a/libstdc++-v3/include/bits/regex_executor.h +++ b/libstdc++-v3/include/bits/regex_executor.h @@ -120,13 +120,14 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION typedef typename _BaseT::_ResultsVec _ResultsVec; typedef regex_constants::match_flag_type _FlagT; - _DFSExecutor(_BiIter __begin, - _BiIter __end, - _ResultsT& __results, - const _RegexT& __nfa, - _FlagT __flags) + _DFSExecutor(_BiIter __begin, + _BiIter __end, + _ResultsT& __results, + const _RegexT& __nfa, + const _TraitsT& __traits, + _FlagT __flags) : _BaseT(__begin, __end, __results, __flags, __nfa._M_sub_count()), - _M_traits(_TraitsT()), _M_nfa(__nfa), _M_results_ret(this->_M_results) + _M_traits(__traits), _M_nfa(__nfa), _M_results_ret(this->_M_results) { } void @@ -142,9 +143,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION bool _M_dfs(_StateIdT __i); - _ResultsVec _M_results_ret; - _TraitsT _M_traits; - const _RegexT& _M_nfa; + _ResultsVec _M_results_ret; + const _TraitsT& _M_traits; + const _RegexT& _M_nfa; }; // Like the DFS approach, it try every possible state transition; Unlike DFS, diff --git a/libstdc++-v3/include/bits/regex_executor.tcc b/libstdc++-v3/include/bits/regex_executor.tcc index edfd0b6..788d65e 100644 --- a/libstdc++-v3/include/bits/regex_executor.tcc +++ b/libstdc++-v3/include/bits/regex_executor.tcc @@ -320,7 +320,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION auto __p = std::static_pointer_cast<_NFA<_CharT, _TraitsT>> (__re._M_automaton); if (__p->_M_has_backref) - return _ExecutorPtr(new _DFSExecutorT(__b, __e, __m, *__p, __flags)); + return _ExecutorPtr(new _DFSExecutorT(__b, __e, __m, *__p, + __re._M_traits, __flags)); return _ExecutorPtr(new _BFSExecutorT(__b, __e, __m, *__p, __flags)); } diff --git a/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/extended/wstring_locale.cc b/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/extended/wstring_locale.cc new file mode 100644 index 0000000..a7225cb --- /dev/null +++ b/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/extended/wstring_locale.cc @@ -0,0 +1,48 @@ +// { dg-options "-std=gnu++11" } +// { dg-require-namedlocale "de_DE.UTF-8" } + +// +// 2013-08-23 Tim Shen <timshen91@gmail.com> +// +// Copyright (C) 2013 Free Software Foundation, Inc. +// +// This file is part of the GNU ISO C++ Library. This library is free +// software; you can redistribute it and/or modify it under the +// terms of the GNU General Public License as published by the +// Free Software Foundation; either version 3, or (at your option) +// any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License along +// with this library; see the file COPYING3. If not see +// <http://www.gnu.org/licenses/>. + +// 28.11.2 regex_match +// Tests Extended localization against a wide-string. + +#include <regex> +#include <testsuite_hooks.h> + +void +test01() +{ + bool test __attribute__((unused)) = true; + + std::wstring str2 = L"ÜBER"; + std::wregex re2; + re2.imbue(std::locale("de_DE.UTF-8")); + re2.assign(L"[[:upper:]]*", std::regex::extended); + std::wsmatch m2; + VERIFY(std::regex_match(str2, m2, re2)); +} + +int +main() +{ + test01(); + return 0; +} diff --git a/libstdc++-v3/testsuite/28_regex/iterators/regex_token_iterator/wchar_t/wstring_02.cc b/libstdc++-v3/testsuite/28_regex/iterators/regex_token_iterator/wchar_t/wstring_02.cc new file mode 100644 index 0000000..4e58a4f --- /dev/null +++ b/libstdc++-v3/testsuite/28_regex/iterators/regex_token_iterator/wchar_t/wstring_02.cc @@ -0,0 +1,53 @@ +// { dg-options "-std=gnu++11" } +// { dg-require-namedlocale "en_US.UTF-8" } + +// +// 2013-08-24 Tim Shen <timshen91@gmail.com> +// +// Copyright (C) 2013 Free Software Foundation, Inc. +// +// This file is part of the GNU ISO C++ Library. This library is free +// software; you can redistribute it and/or modify it under the +// terms of the GNU General Public License as published by the +// Free Software Foundation; either version 3, or (at your option) +// any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License along +// with this library; see the file COPYING3. If not see +// <http://www.gnu.org/licenses/>. + +// 28.12.2 regex_token_iterator +// Tests regex_token_iterator class over + +#include <regex> +#include <testsuite_hooks.h> + +void +test01() +{ + bool test __attribute__((unused)) = true; + + std::setlocale(LC_ALL, "en_US.UTF-8"); + + std::wstring str2 = L"öäü"; + std::wregex re2; + re2.assign(L"([[:lower:]]+)"); + std::wsmatch m2; + + std::wsregex_token_iterator end {}; + std::wsregex_token_iterator p{str2.begin(), str2.end(), re2, {1}}; + + VERIFY(p == end); +} + +int +main() +{ + test01(); + return 0; +} ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [Patch 2/2] Localization problem in regex 2013-08-24 6:15 ` Tim Shen @ 2013-08-29 18:45 ` Tim Shen 0 siblings, 0 replies; 13+ messages in thread From: Tim Shen @ 2013-08-29 18:45 UTC (permalink / raw) To: Stefan Schweter; +Cc: libstdc++, gcc-patches [-- Attachment #1: Type: text/plain, Size: 139 bytes --] > Fixed. > > It's not fully tested. I just run the 28_regex testcases. I'll do a > full test before committing. Committed. -- Tim Shen [-- Attachment #2: changelog --] [-- Type: application/octet-stream, Size: 766 bytes --] 2013-08-29 Tim Shen <timshen91@gmail.com> * include/bits/regex.h (basic_regex<>::assign): Don't lose _M_traits. (regex_iterator<>::regex_iterator): Return nullptr when regex_search failed. (regex_token_iterator<>::_M_end_of_seq): Should be defined true when _M_result is(not isn't) nullptr. * include/bits/regex_compiler.h: Store _Compiler::_M_traits by reference instead of by value. * include/bits/regex_executor.h (_DFSExecutor<>::_DFSExecutor): Add _M_traits to _DFSExecutor. * include/bits/regex_executor.tcc (__get_executor<>): Pass traits to _DFSExecutor too. * testsuite/28_regex/algorithms/regex_match/extended/wstring_locale.cc: New. * testsuite/28_regex/iterators/regex_token_iterator/wchar_t/ wstring_02.cc: New. [-- Attachment #3: regex-locale.patch --] [-- Type: application/octet-stream, Size: 8436 bytes --] Index: include/bits/regex_compiler.h =================================================================== --- include/bits/regex_compiler.h (revision 202081) +++ include/bits/regex_compiler.h (revision 202082) @@ -214,7 +214,7 @@ return _M_traits.transform(__s.begin(), __s.end()); } - _TraitsT _M_traits; + const _TraitsT& _M_traits; _FlagT _M_flags; bool _M_is_non_matching; std::vector<_CharT> _M_char_set; Index: include/bits/regex_executor.tcc =================================================================== --- include/bits/regex_executor.tcc (revision 202081) +++ include/bits/regex_executor.tcc (revision 202082) @@ -320,7 +320,8 @@ auto __p = std::static_pointer_cast<_NFA<_CharT, _TraitsT>> (__re._M_automaton); if (__p->_M_has_backref) - return _ExecutorPtr(new _DFSExecutorT(__b, __e, __m, *__p, __flags)); + return _ExecutorPtr(new _DFSExecutorT(__b, __e, __m, *__p, + __re._M_traits, __flags)); return _ExecutorPtr(new _BFSExecutorT(__b, __e, __m, *__p, __flags)); } Index: include/bits/regex_executor.h =================================================================== --- include/bits/regex_executor.h (revision 202081) +++ include/bits/regex_executor.h (revision 202082) @@ -120,13 +120,14 @@ typedef typename _BaseT::_ResultsVec _ResultsVec; typedef regex_constants::match_flag_type _FlagT; - _DFSExecutor(_BiIter __begin, - _BiIter __end, - _ResultsT& __results, - const _RegexT& __nfa, - _FlagT __flags) + _DFSExecutor(_BiIter __begin, + _BiIter __end, + _ResultsT& __results, + const _RegexT& __nfa, + const _TraitsT& __traits, + _FlagT __flags) : _BaseT(__begin, __end, __results, __flags, __nfa._M_sub_count()), - _M_traits(_TraitsT()), _M_nfa(__nfa), _M_results_ret(this->_M_results) + _M_traits(__traits), _M_nfa(__nfa), _M_results_ret(this->_M_results) { } void @@ -142,9 +143,9 @@ bool _M_dfs(_StateIdT __i); - _ResultsVec _M_results_ret; - _TraitsT _M_traits; - const _RegexT& _M_nfa; + _ResultsVec _M_results_ret; + const _TraitsT& _M_traits; + const _RegexT& _M_nfa; }; // Like the DFS approach, it try every possible state transition; Unlike DFS, Index: include/bits/regex.h =================================================================== --- include/bits/regex.h (revision 202081) +++ include/bits/regex.h (revision 202082) @@ -880,8 +880,10 @@ assign(const basic_string<_Ch_type, _Ch_typeraits, _Alloc>& __s, flag_type __flags = ECMAScript) { - basic_regex __tmp(__s, __flags); - this->swap(__tmp); + _M_flags = __flags; + _M_automaton = + __detail::_Compiler<decltype(__s.begin()), _Ch_type, _Rx_traits> + (__s.begin(), __s.end(), _M_traits, _M_flags)._M_get_nfa(); return *this; } @@ -2591,7 +2593,10 @@ regex_constants::match_flag_type __m = regex_constants::match_default) : _M_begin(__a), _M_end(__b), _M_pregex(&__re), _M_flags(__m), _M_match() - { regex_search(_M_begin, _M_end, _M_match, *_M_pregex, _M_flags); } + { + if (!regex_search(_M_begin, _M_end, _M_match, *_M_pregex, _M_flags)) + *this = regex_iterator(); + } /** * Copy constructs a %regex_iterator. @@ -2905,9 +2910,9 @@ return (*_M_position)[_M_subs[_M_n]]; } - bool - _M_end_of_seq() const - { return _M_result != nullptr; } + constexpr bool + _M_end_of_seq() + { return _M_result == nullptr; } _Position _M_position; const value_type* _M_result; Index: ChangeLog =================================================================== --- ChangeLog (revision 202081) +++ ChangeLog (revision 202082) @@ -1,3 +1,21 @@ +2013-08-29 Tim Shen <timshen91@gmail.com> + + * include/bits/regex.h (basic_regex<>::assign): Don't lose _M_traits. + (regex_iterator<>::regex_iterator): Return nullptr when regex_search + failed. + (regex_token_iterator<>::_M_end_of_seq): Should be defined true when + _M_result is(not isn't) nullptr. + * include/bits/regex_compiler.h: Store _Compiler::_M_traits by reference + instead of by value. + * include/bits/regex_executor.h (_DFSExecutor<>::_DFSExecutor): Add + _M_traits to _DFSExecutor. + * include/bits/regex_executor.tcc (__get_executor<>): Pass traits to + _DFSExecutor too. + * testsuite/28_regex/algorithms/regex_match/extended/wstring_locale.cc: + New. + * testsuite/28_regex/iterators/regex_token_iterator/wchar_t/ + wstring_02.cc: New. + 2013-08-26 Tim Shen <timshen91@gmail.com> * include/Makefile.am: Add regex_scanner.{h,tcc}. Index: testsuite/28_regex/algorithms/regex_match/extended/wstring_locale.cc =================================================================== --- testsuite/28_regex/algorithms/regex_match/extended/wstring_locale.cc (revision 0) +++ testsuite/28_regex/algorithms/regex_match/extended/wstring_locale.cc (revision 202082) @@ -0,0 +1,48 @@ +// { dg-options "-std=gnu++11" } +// { dg-require-namedlocale "de_DE.UTF-8" } + +// +// 2013-08-29 Tim Shen <timshen91@gmail.com> +// +// Copyright (C) 2013 Free Software Foundation, Inc. +// +// This file is part of the GNU ISO C++ Library. This library is free +// software; you can redistribute it and/or modify it under the +// terms of the GNU General Public License as published by the +// Free Software Foundation; either version 3, or (at your option) +// any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License along +// with this library; see the file COPYING3. If not see +// <http://www.gnu.org/licenses/>. + +// 28.11.2 regex_match +// Tests Extended localization against a wide-string. + +#include <regex> +#include <testsuite_hooks.h> + +void +test01() +{ + bool test __attribute__((unused)) = true; + + std::wstring str2 = L"ÜBER"; + std::wregex re2; + re2.imbue(std::locale("de_DE.UTF-8")); + re2.assign(L"[[:upper:]]*", std::regex::extended); + std::wsmatch m2; + VERIFY(std::regex_match(str2, m2, re2)); +} + +int +main() +{ + test01(); + return 0; +} Index: testsuite/28_regex/iterators/regex_token_iterator/wchar_t/wstring_02.cc =================================================================== --- testsuite/28_regex/iterators/regex_token_iterator/wchar_t/wstring_02.cc (revision 0) +++ testsuite/28_regex/iterators/regex_token_iterator/wchar_t/wstring_02.cc (revision 202082) @@ -0,0 +1,53 @@ +// { dg-options "-std=gnu++11" } +// { dg-require-namedlocale "en_US.UTF-8" } + +// +// 2013-08-29 Tim Shen <timshen91@gmail.com> +// +// Copyright (C) 2013 Free Software Foundation, Inc. +// +// This file is part of the GNU ISO C++ Library. This library is free +// software; you can redistribute it and/or modify it under the +// terms of the GNU General Public License as published by the +// Free Software Foundation; either version 3, or (at your option) +// any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License along +// with this library; see the file COPYING3. If not see +// <http://www.gnu.org/licenses/>. + +// 28.12.2 regex_token_iterator +// Tests regex_token_iterator class over a localized wstring. + +#include <regex> +#include <testsuite_hooks.h> + +void +test01() +{ + bool test __attribute__((unused)) = true; + + std::setlocale(LC_ALL, "en_US.UTF-8"); + + std::wstring str2 = L"öäü"; + std::wregex re2; + re2.assign(L"([[:lower:]]+)"); + std::wsmatch m2; + + std::wsregex_token_iterator end {}; + std::wsregex_token_iterator p{str2.begin(), str2.end(), re2, {1}}; + + VERIFY(p == end); +} + +int +main() +{ + test01(); + return 0; +} ^ permalink raw reply [flat|nested] 13+ messages in thread
end of thread, other threads:[~2013-08-29 18:41 UTC | newest] Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2013-08-23 9:25 [Patch 2/2] Localization problem in regex Tim Shen 2013-08-23 10:15 ` Jonathan Wakely 2013-08-23 11:08 ` Paolo Carlini 2013-08-23 14:04 ` Tim Shen 2013-08-23 14:27 ` Paolo Carlini 2013-08-23 14:57 ` Tim Shen 2013-08-23 15:02 ` Jonathan Wakely 2013-08-23 15:05 ` Paolo Carlini 2013-08-23 15:48 ` Tim Shen 2013-08-23 16:20 ` Jonathan Wakely 2013-08-23 19:04 ` Stefan Schweter 2013-08-24 6:15 ` Tim Shen 2013-08-29 18:45 ` Tim Shen
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).