public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [Patch, libstdc++/63497] Avoid dereferencing invalid iterator in regex_executor
@ 2014-10-20 17:28 Tim Shen
  2014-10-21 10:54 ` Jonathan Wakely
  0 siblings, 1 reply; 9+ messages in thread
From: Tim Shen @ 2014-10-20 17:28 UTC (permalink / raw)
  To: libstdc++, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 58 bytes --]

Bootstrapped and tested.

Thanks!


-- 
Regards,
Tim Shen

[-- Attachment #2: a.diff --]
[-- Type: text/plain, Size: 2942 bytes --]

commit 95c73ab6280c1f8182d018ee29a44230965dd4ef
Author: timshen <timshen@google.com>
Date:   Sun Oct 19 15:14:55 2014 -0700

    	PR libstdc++/63497
    	include/bits/regex_executor.h (_Executor::_M_word_boundary): Remove
    	const qualifier.
    	include/bits/regex_executor.tcc (_Executor::_M_dfs,
    	_Executor::_M_word_boundary): Avoid dereferencing _M_current at _M_end
    	or other invalid position.

diff --git a/libstdc++-v3/include/bits/regex_executor.h b/libstdc++-v3/include/bits/regex_executor.h
index cd9e55d..b867951 100644
--- a/libstdc++-v3/include/bits/regex_executor.h
+++ b/libstdc++-v3/include/bits/regex_executor.h
@@ -145,7 +145,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
       }
 
       bool
-      _M_word_boundary(_State<_TraitsT> __state) const;
+      _M_word_boundary(_State<_TraitsT> __state);
 
       bool
       _M_lookahead(_State<_TraitsT> __state);
diff --git a/libstdc++-v3/include/bits/regex_executor.tcc b/libstdc++-v3/include/bits/regex_executor.tcc
index 5eab852..9655c7a 100644
--- a/libstdc++-v3/include/bits/regex_executor.tcc
+++ b/libstdc++-v3/include/bits/regex_executor.tcc
@@ -284,9 +284,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 	    _M_dfs(__match_mode, __state._M_next);
 	  break;
 	case _S_opcode_match:
+	  if (_M_current == _M_end)
+	    break;
 	  if (__dfs_mode)
 	    {
-	      if (_M_current != _M_end && __state._M_matches(*_M_current))
+	      if (__state._M_matches(*_M_current))
 		{
 		  ++_M_current;
 		  _M_dfs(__match_mode, __state._M_next);
@@ -407,25 +409,28 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   template<typename _BiIter, typename _Alloc, typename _TraitsT,
 	   bool __dfs_mode>
     bool _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>::
-    _M_word_boundary(_State<_TraitsT>) const
+    _M_word_boundary(_State<_TraitsT>)
     {
-      // By definition.
-      bool __ans = false;
-      auto __pre = _M_current;
-      --__pre;
-      if (!(_M_at_begin() && _M_at_end()))
+      bool __left_is_word = false;
+      if (_M_current != _M_begin
+	  || (_M_flags & regex_constants::match_prev_avail))
 	{
-	  if (_M_at_begin())
-	    __ans = _M_is_word(*_M_current)
-	      && !(_M_flags & regex_constants::match_not_bow);
-	  else if (_M_at_end())
-	    __ans = _M_is_word(*__pre)
-	      && !(_M_flags & regex_constants::match_not_eow);
-	  else
-	    __ans = _M_is_word(*_M_current)
-	      != _M_is_word(*__pre);
+	  --_M_current;
+	  if (_M_is_word(*_M_current))
+	    __left_is_word = true;
+	  ++_M_current;
 	}
-      return __ans;
+      bool __right_is_word = false;
+      if (_M_current != _M_end && _M_is_word(*_M_current))
+	__right_is_word = true;
+
+      if (__left_is_word == __right_is_word)
+	return false;
+      if (__left_is_word && !(_M_flags & regex_constants::match_not_eow))
+	return true;
+      if (__right_is_word && !(_M_flags & regex_constants::match_not_bow))
+	return true;
+      return false;
     }
 
 _GLIBCXX_END_NAMESPACE_VERSION

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [Patch, libstdc++/63497] Avoid dereferencing invalid iterator in regex_executor
  2014-10-20 17:28 [Patch, libstdc++/63497] Avoid dereferencing invalid iterator in regex_executor Tim Shen
@ 2014-10-21 10:54 ` Jonathan Wakely
  2014-10-21 16:48   ` Tim Shen
  0 siblings, 1 reply; 9+ messages in thread
From: Jonathan Wakely @ 2014-10-21 10:54 UTC (permalink / raw)
  To: Tim Shen; +Cc: libstdc++, gcc-patches

On 20/10/14 10:23 -0700, Tim Shen wrote:
>Bootstrapped and tested.

Did you manage to produce a testcase that crashed on trunk?

>@@ -407,25 +409,28 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
>   template<typename _BiIter, typename _Alloc, typename _TraitsT,
> 	   bool __dfs_mode>
>     bool _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>::
>-    _M_word_boundary(_State<_TraitsT>) const
>+    _M_word_boundary(_State<_TraitsT>)
>     {
>-      // By definition.
>-      bool __ans = false;
>-      auto __pre = _M_current;
>-      --__pre;
>-      if (!(_M_at_begin() && _M_at_end()))
>+      bool __left_is_word = false;
>+      if (_M_current != _M_begin
>+	  || (_M_flags & regex_constants::match_prev_avail))
> 	{
>-	  if (_M_at_begin())
>-	    __ans = _M_is_word(*_M_current)
>-	      && !(_M_flags & regex_constants::match_not_bow);
>-	  else if (_M_at_end())
>-	    __ans = _M_is_word(*__pre)
>-	      && !(_M_flags & regex_constants::match_not_eow);
>-	  else
>-	    __ans = _M_is_word(*_M_current)
>-	      != _M_is_word(*__pre);
>+	  --_M_current;
>+	  if (_M_is_word(*_M_current))
>+	    __left_is_word = true;
>+	  ++_M_current;

Is it really necessary to modify _M_current here?
Couldn't you do:

       auto __pre = _M_current;
       if (_M_is_word(*--__pre))
         __left_is_word = true;

Then the function could remain const, couldn't it?

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [Patch, libstdc++/63497] Avoid dereferencing invalid iterator in regex_executor
  2014-10-21 10:54 ` Jonathan Wakely
@ 2014-10-21 16:48   ` Tim Shen
  2014-10-22 14:39     ` Jonathan Wakely
  0 siblings, 1 reply; 9+ messages in thread
From: Tim Shen @ 2014-10-21 16:48 UTC (permalink / raw)
  To: Jonathan Wakely; +Cc: libstdc++, gcc-patches

On Tue, Oct 21, 2014 at 3:25 AM, Jonathan Wakely <jwakely@redhat.com> wrote:
> Did you manage to produce a testcase that crashed on trunk?

Oh, I forgot to mention that I've tried my best to make a testcase that
crashes the trunk, but failed :).

I'm not sure if I should directly put an assert in the code and make a
testcase to explode it. Now I think it's better to do it.

> Is it really necessary to modify _M_current here?
> Couldn't you do:
>
>       auto __pre = _M_current;
>       if (_M_is_word(*--__pre))
>         __left_is_word = true;
>
> Then the function could remain const, couldn't it?

That's exactly what I did in the early version of this patch. But
later I changed it because I assumed that copying an iterator is
potentially expensive, whereas mutating it is cheaper.

Wouldn't making this function const enable some optimizations?
But I have no idea how much it would gain, or whether it is worth it.


-- 
Regards,
Tim Shen

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [Patch, libstdc++/63497] Avoid dereferencing invalid iterator in regex_executor
  2014-10-21 16:48   ` Tim Shen
@ 2014-10-22 14:39     ` Jonathan Wakely
  2014-10-22 22:04       ` Tim Shen
  0 siblings, 1 reply; 9+ messages in thread
From: Jonathan Wakely @ 2014-10-22 14:39 UTC (permalink / raw)
  To: Tim Shen; +Cc: libstdc++, gcc-patches

On 21/10/14 09:45 -0700, Tim Shen wrote:
>On Tue, Oct 21, 2014 at 3:25 AM, Jonathan Wakely <jwakely@redhat.com> wrote:
>> Did you manage to produce a testcase that crashed on trunk?
>
>Oh, I forgot to mention that I've tried my best to make a testcase that
>crashes the trunk, but failed :).
>
>I'm not sure if I should directly put an assert in the code and make a
>testcase to explode it. Now I think it's better to do it.

Only if it's likely to catch problems in future. If you'd be putting
it in only to make a testcase fail then it's not worth it.

>> Is it really necessary to modify _M_current here?
>> Couldn't you do:
>>
>>       auto __pre = _M_current;
>>       if (_M_is_word(*--__pre))
>>         __left_is_word = true;
>>
>> Then the function could remain const, couldn't it?
>
>That's exactly what I did in the early version of this patch. But
>later I changed it because I assumed that copying an iterator is
>potentially expensive, whereas mutating it is cheaper.

In general iterators are always passed by value and should be cheap to
copy. Inside regex the iterator is usually a const char* so is very
cheap to copy.

>Wouldn't making this function const enable some optimizations?
>But I have no idea how much it would gain, or whether it is worth it.

It's unlikely (see http://www.gotw.ca/gotw/081.htm).

I just don't see the point in making it a non-const function just to
perform a micro-optimisation.

If you were passing an integer to a function would you do
  f(i-1);
or
  --i;
  f(i);
  ++i; 
?

The first form seems obviously better to me.

You could even simplify it further using std::prev:

      if (_M_is_word(*std::prev(__pre)))
        __left_is_word = true;

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [Patch, libstdc++/63497] Avoid dereferencing invalid iterator in regex_executor
  2014-10-22 14:39     ` Jonathan Wakely
@ 2014-10-22 22:04       ` Tim Shen
  2014-10-22 23:05         ` Jonathan Wakely
  0 siblings, 1 reply; 9+ messages in thread
From: Tim Shen @ 2014-10-22 22:04 UTC (permalink / raw)
  To: Jonathan Wakely; +Cc: libstdc++, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 459 bytes --]

On Wed, Oct 22, 2014 at 7:34 AM, Jonathan Wakely <jwakely@redhat.com> wrote:
> Only if it's likely to catch problems in future. If you'd be putting
> it in only to make a testcase fail then it's not worth it.

No, since I think with this patch _M_current is clearly valid when
being dereferenced.

> You could even simplify it further using std::prev:
>
>      if (_M_is_word(*std::prev(__pre)))
>        __left_is_word = true;

Done.


-- 
Regards,
Tim Shen

[-- Attachment #2: a.diff --]
[-- Type: text/plain, Size: 3246 bytes --]

commit 386598d6edb729053325921a0bad66c413dc93ea
Author: timshen <timshen@google.com>
Date:   Sun Oct 19 15:14:55 2014 -0700

    	PR libstdc++/63497
    	include/bits/regex_executor.h (_Executor::_M_word_boundary): Remove
    	unused parameter.
    	include/bits/regex_executor.tcc (_Executor::_M_dfs,
    	_Executor::_M_word_boundary): Avoid dereferencing _M_current at _M_end
    	or other invalid position.

diff --git a/libstdc++-v3/include/bits/regex_executor.h b/libstdc++-v3/include/bits/regex_executor.h
index cd9e55d..b26992c 100644
--- a/libstdc++-v3/include/bits/regex_executor.h
+++ b/libstdc++-v3/include/bits/regex_executor.h
@@ -145,7 +145,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
       }
 
       bool
-      _M_word_boundary(_State<_TraitsT> __state) const;
+      _M_word_boundary() const;
 
       bool
       _M_lookahead(_State<_TraitsT> __state);
diff --git a/libstdc++-v3/include/bits/regex_executor.tcc b/libstdc++-v3/include/bits/regex_executor.tcc
index 5eab852..38d4781 100644
--- a/libstdc++-v3/include/bits/regex_executor.tcc
+++ b/libstdc++-v3/include/bits/regex_executor.tcc
@@ -274,7 +274,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 	    _M_dfs(__match_mode, __state._M_next);
 	  break;
 	case _S_opcode_word_boundary:
-	  if (_M_word_boundary(__state) == !__state._M_neg)
+	  if (_M_word_boundary() == !__state._M_neg)
 	    _M_dfs(__match_mode, __state._M_next);
 	  break;
 	// Here __state._M_alt offers a single start node for a sub-NFA.
@@ -284,9 +284,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 	    _M_dfs(__match_mode, __state._M_next);
 	  break;
 	case _S_opcode_match:
+	  if (_M_current == _M_end)
+	    break;
 	  if (__dfs_mode)
 	    {
-	      if (_M_current != _M_end && __state._M_matches(*_M_current))
+	      if (__state._M_matches(*_M_current))
 		{
 		  ++_M_current;
 		  _M_dfs(__match_mode, __state._M_next);
@@ -407,25 +409,26 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   template<typename _BiIter, typename _Alloc, typename _TraitsT,
 	   bool __dfs_mode>
     bool _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>::
-    _M_word_boundary(_State<_TraitsT>) const
+    _M_word_boundary() const
     {
-      // By definition.
-      bool __ans = false;
-      auto __pre = _M_current;
-      --__pre;
-      if (!(_M_at_begin() && _M_at_end()))
+      bool __left_is_word = false;
+      if (_M_current != _M_begin
+	  || (_M_flags & regex_constants::match_prev_avail))
 	{
-	  if (_M_at_begin())
-	    __ans = _M_is_word(*_M_current)
-	      && !(_M_flags & regex_constants::match_not_bow);
-	  else if (_M_at_end())
-	    __ans = _M_is_word(*__pre)
-	      && !(_M_flags & regex_constants::match_not_eow);
-	  else
-	    __ans = _M_is_word(*_M_current)
-	      != _M_is_word(*__pre);
+	  auto __prev = _M_current;
+	  if (_M_is_word(*std::prev(__prev)))
+	    __left_is_word = true;
 	}
-      return __ans;
+      bool __right_is_word =
+        _M_current != _M_end && _M_is_word(*_M_current);
+
+      if (__left_is_word == __right_is_word)
+	return false;
+      if (__left_is_word && !(_M_flags & regex_constants::match_not_eow))
+	return true;
+      if (__right_is_word && !(_M_flags & regex_constants::match_not_bow))
+	return true;
+      return false;
     }
 
 _GLIBCXX_END_NAMESPACE_VERSION

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [Patch, libstdc++/63497] Avoid dereferencing invalid iterator in regex_executor
  2014-10-22 22:04       ` Tim Shen
@ 2014-10-22 23:05         ` Jonathan Wakely
  2014-10-23  3:29           ` Tim Shen
  0 siblings, 1 reply; 9+ messages in thread
From: Jonathan Wakely @ 2014-10-22 23:05 UTC (permalink / raw)
  To: Tim Shen; +Cc: libstdc++, gcc-patches

On 22/10/14 14:53 -0700, Tim Shen wrote:
>On Wed, Oct 22, 2014 at 7:34 AM, Jonathan Wakely <jwakely@redhat.com> wrote:
>> Only if it's likely to catch problems in future. If you'd be putting
>> it in only to make a testcase fail then it's not worth it.
>
>No, since I think with this patch _M_current is clearly valid when
>being dereferenced.
>
>> You could even simplify it further using std::prev:
>>
>>      if (_M_is_word(*std::prev(__pre)))
>>        __left_is_word = true;
>
>Done.

OK to commit, thanks!

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [Patch, libstdc++/63497] Avoid dereferencing invalid iterator in regex_executor
  2014-10-22 23:05         ` Jonathan Wakely
@ 2014-10-23  3:29           ` Tim Shen
  2014-11-25  8:46             ` Tim Shen
  0 siblings, 1 reply; 9+ messages in thread
From: Tim Shen @ 2014-10-23  3:29 UTC (permalink / raw)
  To: Jonathan Wakely; +Cc: libstdc++, gcc-patches

On Wed, Oct 22, 2014 at 4:04 PM, Jonathan Wakely <jwakely@redhat.com> wrote:
> OK to commit, thanks!

Committed. Thank you too!


-- 
Regards,
Tim Shen

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [Patch, libstdc++/63497] Avoid dereferencing invalid iterator in regex_executor
  2014-10-23  3:29           ` Tim Shen
@ 2014-11-25  8:46             ` Tim Shen
  2014-11-25 11:11               ` Jonathan Wakely
  0 siblings, 1 reply; 9+ messages in thread
From: Tim Shen @ 2014-11-25  8:46 UTC (permalink / raw)
  To: Jonathan Wakely; +Cc: libstdc++, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 256 bytes --]

On Wed, Oct 22, 2014 at 8:19 PM, Tim Shen <timshen@google.com> wrote:
> Committed. Thank you too!

I'm backporting this patch to gcc-4_9-branch. Do we usually bootstrap &
test it and then commit directly, or should it be reviewed again?


-- 
Regards,
Tim Shen

[-- Attachment #2: b.diff --]
[-- Type: text/plain, Size: 3247 bytes --]

commit 1e146769d08ff19cc01a08b91ca8fd3151f34faf
Author: timshen <timshen@google.com>
Date:   Tue Nov 25 00:36:25 2014 -0800

    	PR libstdc++/63497
    	include/bits/regex_executor.h (_Executor::_M_word_boundary): Remove
    	unused parameter.
    	include/bits/regex_executor.tcc (_Executor::_M_dfs,
    	_Executor::_M_word_boundary): Avoid dereferencing _M_current at _M_end
    	or other invalid position.

diff --git a/libstdc++-v3/include/bits/regex_executor.h b/libstdc++-v3/include/bits/regex_executor.h
index 708c78e..0d1b676 100644
--- a/libstdc++-v3/include/bits/regex_executor.h
+++ b/libstdc++-v3/include/bits/regex_executor.h
@@ -134,7 +134,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
       }
 
       bool
-      _M_word_boundary(_State<_TraitsT> __state) const;
+      _M_word_boundary() const;
 
       bool
       _M_lookahead(_State<_TraitsT> __state);
diff --git a/libstdc++-v3/include/bits/regex_executor.tcc b/libstdc++-v3/include/bits/regex_executor.tcc
index 052302b..ef49161 100644
--- a/libstdc++-v3/include/bits/regex_executor.tcc
+++ b/libstdc++-v3/include/bits/regex_executor.tcc
@@ -257,7 +257,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 	    _M_dfs<__match_mode>(__state._M_next);
 	  break;
 	case _S_opcode_word_boundary:
-	  if (_M_word_boundary(__state) == !__state._M_neg)
+	  if (_M_word_boundary() == !__state._M_neg)
 	    _M_dfs<__match_mode>(__state._M_next);
 	  break;
 	// Here __state._M_alt offers a single start node for a sub-NFA.
@@ -267,9 +267,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 	    _M_dfs<__match_mode>(__state._M_next);
 	  break;
 	case _S_opcode_match:
+	  if (_M_current == _M_end)
+	    break;
 	  if (__dfs_mode)
 	    {
-	      if (_M_current != _M_end && __state._M_matches(*_M_current))
+	      if (__state._M_matches(*_M_current))
 		{
 		  ++_M_current;
 		  _M_dfs<__match_mode>(__state._M_next);
@@ -348,25 +350,26 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   template<typename _BiIter, typename _Alloc, typename _TraitsT,
     bool __dfs_mode>
     bool _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>::
-    _M_word_boundary(_State<_TraitsT> __state) const
+    _M_word_boundary() const
     {
-      // By definition.
-      bool __ans = false;
-      auto __pre = _M_current;
-      --__pre;
-      if (!(_M_at_begin() && _M_at_end()))
+      bool __left_is_word = false;
+      if (_M_current != _M_begin
+	  || (_M_flags & regex_constants::match_prev_avail))
 	{
-	  if (_M_at_begin())
-	    __ans = _M_is_word(*_M_current)
-	      && !(_M_flags & regex_constants::match_not_bow);
-	  else if (_M_at_end())
-	    __ans = _M_is_word(*__pre)
-	      && !(_M_flags & regex_constants::match_not_eow);
-	  else
-	    __ans = _M_is_word(*_M_current)
-	      != _M_is_word(*__pre);
+	  auto __prev = _M_current;
+	  if (_M_is_word(*std::prev(__prev)))
+	    __left_is_word = true;
 	}
-      return __ans;
+      bool __right_is_word =
+	_M_current != _M_end && _M_is_word(*_M_current);
+
+      if (__left_is_word == __right_is_word)
+	return false;
+      if (__left_is_word && !(_M_flags & regex_constants::match_not_eow))
+	return true;
+      if (__right_is_word && !(_M_flags & regex_constants::match_not_bow))
+	return true;
+      return false;
     }
 
 _GLIBCXX_END_NAMESPACE_VERSION

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [Patch, libstdc++/63497] Avoid dereferencing invalid iterator in regex_executor
  2014-11-25  8:46             ` Tim Shen
@ 2014-11-25 11:11               ` Jonathan Wakely
  0 siblings, 0 replies; 9+ messages in thread
From: Jonathan Wakely @ 2014-11-25 11:11 UTC (permalink / raw)
  To: Tim Shen; +Cc: libstdc++, gcc-patches

On 25/11/14 00:41 -0800, Tim Shen wrote:
>On Wed, Oct 22, 2014 at 8:19 PM, Tim Shen <timshen@google.com> wrote:
>> Committed. Thank you too!
>
>I'm backporting this patch to gcc-4_9-branch. Do we usually bootstrap &
>test it and then commit directly, or should it be reviewed again?

I approved it for the branch (in the bugzilla comments) so usually you
could just test it and commit it ... but since you asked ... maybe you
should leave the _M_word_boundary signature unchanged for the branch,
since the unused parameter doesn't do any harm and removing it isn't
needed for the fix to work.

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2014-11-25 10:35 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-10-20 17:28 [Patch, libstdc++/63497] Avoid dereferencing invalid iterator in regex_executor Tim Shen
2014-10-21 10:54 ` Jonathan Wakely
2014-10-21 16:48   ` Tim Shen
2014-10-22 14:39     ` Jonathan Wakely
2014-10-22 22:04       ` Tim Shen
2014-10-22 23:05         ` Jonathan Wakely
2014-10-23  3:29           ` Tim Shen
2014-11-25  8:46             ` Tim Shen
2014-11-25 11:11               ` Jonathan Wakely

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).