public inbox for libc-hacker@sourceware.org
 help / color / mirror / Atom feed
* [PATCH] Fix re_string_reconstruct if pstr->offsets_needed
@ 2006-09-06 16:46 Jakub Jelinek
  2006-09-07 13:50 ` Ulrich Drepper
  0 siblings, 1 reply; 2+ messages in thread
From: Jakub Jelinek @ 2006-09-06 16:46 UTC (permalink / raw)
  To: Ulrich Drepper; +Cc: Glibc hackers

Hi!

As the testcase below shows, we weren't handling pstr->offsets_needed
in re_string_reconstruct properly.  One issue is that the index
passed re_string_context_at must be 0 .. pstr->valid_len - 1, rather
than 0 .. pstr->valid_raw_len - 1, as the index is used to access
the pstr->wcs array.  But another, more serious issue is that I thought
the offset < pstr->valid_raw_len && pstr->offsets_needed case can be
handled by the re_string_skip_chars block - it can't.  We'd need to figure
out what exact argument to pass to re_string_context_at, also using
re_string_skip_chars might give us bigger valid_len than bufs_len
and as the testcase shows that leads to corruptions past the end of malloced
buffers.

2006-09-06  Jakub Jelinek  <jakub@redhat.com>

	* posix/regex_internal.c (re_string_reconstruct): Handle
	offset < pstr->valid_raw_len && pstr->offsets_needed case.
	Ensure no bytes read before raw_mbs array.  Pass a saved copy of
	pstr->valid_len - 1 rather than pstr->valid_raw_len - 1 to
	re_string_context_at.
	* posix/Makefile: Add rules to build and run bug-regex26 test.
	* posix/bug-regex26.c: New test.

--- libc/posix/regex_internal.c.jj	2006-09-06 18:12:56.000000000 +0200
+++ libc/posix/regex_internal.c	2006-09-06 18:14:02.000000000 +0200
@@ -585,34 +585,98 @@ re_string_reconstruct (re_string_t *pstr
 
   if (BE (offset != 0, 1))
     {
-      /* Are the characters which are already checked remain?  */
-      if (BE (offset < pstr->valid_raw_len, 1)
-#ifdef RE_ENABLE_I18N
-	  /* Handling this would enlarge the code too much.
-	     Accept a slowdown in that case.  */
-	  && pstr->offsets_needed == 0
-#endif
-	 )
+      /* Should the already checked characters be kept?  */
+      if (BE (offset < pstr->valid_raw_len, 1))
 	{
 	  /* Yes, move them to the front of the buffer.  */
-	  pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
 #ifdef RE_ENABLE_I18N
-	  if (pstr->mb_cur_max > 1)
-	    memmove (pstr->wcs, pstr->wcs + offset,
-		     (pstr->valid_len - offset) * sizeof (wint_t));
+	  if (BE (pstr->offsets_needed, 0))
+	    {
+	      int low = 0, high = pstr->valid_len, mid;
+	      do
+		{
+		  mid = (high + low) / 2;
+		  if (pstr->offsets[mid] > offset)
+		    high = mid;
+		  else if (pstr->offsets[mid] < offset)
+		    low = mid + 1;
+		  else
+		    break;
+		}
+	      while (low < high);
+	      if (pstr->offsets[mid] < offset)
+		++mid;
+	      pstr->tip_context = re_string_context_at (pstr, mid - 1,
+							eflags);
+	      /* This can be quite complicated, so handle specially
+		 only the common and easy case where the character with
+		 different length representation of lower and upper
+		 case is present at or after offset.  */
+	      if (pstr->valid_len > offset
+		  && mid == offset && pstr->offsets[mid] == offset)
+		{
+		  memmove (pstr->wcs, pstr->wcs + offset,
+			   (pstr->valid_len - offset) * sizeof (wint_t));
+		  memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
+		  pstr->valid_len -= offset;
+		  pstr->valid_raw_len -= offset;
+		  for (low = 0; low < pstr->valid_len; low++)
+		    pstr->offsets[low] = pstr->offsets[low + offset] - offset;
+		}
+	      else
+		{
+		  /* Otherwise, just find out how long the partial multibyte
+		     character at offset is and fill it with WEOF/255.  */
+		  pstr->len = pstr->raw_len - idx + offset;
+		  pstr->stop = pstr->raw_stop - idx + offset;
+		  pstr->offsets_needed = 0;
+		  while (mid > 0 && pstr->offsets[mid - 1] == offset)
+		    --mid;
+		  while (mid < pstr->valid_len)
+		    if (pstr->wcs[mid] != WEOF)
+		      break;
+		    else
+		      ++mid;
+		  if (mid == pstr->valid_len)
+		    pstr->valid_len = 0;
+		  else
+		    {
+		      pstr->valid_len = pstr->offsets[mid] - offset;
+		      if (pstr->valid_len)
+			{
+			  for (low = 0; low < pstr->valid_len; ++low)
+			    pstr->wcs[low] = WEOF;
+			  memset (pstr->mbs, 255, pstr->valid_len);
+			}
+		    }
+		  pstr->valid_raw_len = pstr->valid_len;
+		}
+	    }
+	  else
+#endif
+	    {
+	      pstr->tip_context = re_string_context_at (pstr, offset - 1,
+							eflags);
+#ifdef RE_ENABLE_I18N
+	      if (pstr->mb_cur_max > 1)
+		memmove (pstr->wcs, pstr->wcs + offset,
+			 (pstr->valid_len - offset) * sizeof (wint_t));
 #endif /* RE_ENABLE_I18N */
-	  if (BE (pstr->mbs_allocated, 0))
-	    memmove (pstr->mbs, pstr->mbs + offset,
-		     pstr->valid_len - offset);
-	  pstr->valid_len -= offset;
-	  pstr->valid_raw_len -= offset;
+	      if (BE (pstr->mbs_allocated, 0))
+		memmove (pstr->mbs, pstr->mbs + offset,
+			 pstr->valid_len - offset);
+	      pstr->valid_len -= offset;
+	      pstr->valid_raw_len -= offset;
 #if DEBUG
-	  assert (pstr->valid_len > 0);
+	      assert (pstr->valid_len > 0);
 #endif
+	    }
 	}
       else
 	{
 	  /* No, skip all characters until IDX.  */
+	  int prev_valid_len = pstr->valid_len;
+
 #ifdef RE_ENABLE_I18N
 	  if (BE (pstr->offsets_needed, 0))
 	    {
@@ -636,6 +700,8 @@ re_string_reconstruct (re_string_t *pstr
 		     byte other than 0x80 - 0xbf.  */
 		  raw = pstr->raw_mbs + pstr->raw_mbs_idx;
 		  end = raw + (offset - pstr->mb_cur_max);
+		  if (end < pstr->raw_mbs)
+		    end = pstr->raw_mbs;
 		  p = raw + offset - 1;
 #ifdef _LIBC
 		  /* We know the wchar_t encoding is UCS4, so for the simple
@@ -643,7 +709,7 @@ re_string_reconstruct (re_string_t *pstr
 		  if (isascii (*p) && BE (pstr->trans == NULL, 1))
 		    {
 		      memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
-		      pstr->valid_len = 0;
+		      /* pstr->valid_len = 0; */
 		      wc = (wchar_t) *p;
 		    }
 		  else
@@ -686,7 +752,7 @@ re_string_reconstruct (re_string_t *pstr
 		pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
 	      if (wc == WEOF)
 		pstr->tip_context
-		  = re_string_context_at (pstr, pstr->valid_raw_len - 1, eflags);
+		  = re_string_context_at (pstr, prev_valid_len - 1, eflags);
 	      else
 		pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
 				      && IS_WIDE_WORD_CHAR (wc))
--- libc/posix/bug-regex26.c.jj	2006-09-06 18:12:36.000000000 +0200
+++ libc/posix/bug-regex26.c	2006-09-06 18:12:30.000000000 +0200
@@ -0,0 +1,38 @@
+/* Test re_search with dotless i.
+   Copyright (C) 2006 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Jakub Jelinek <jakub@redhat.com>, 2006.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <locale.h>
+#include <regex.h>
+#include <string.h>
+
+int
+main (void)
+{
+  struct re_pattern_buffer r;
+  struct re_registers s;
+  setlocale (LC_ALL, "en_US.UTF-8");
+  memset (&r, 0, sizeof (r));
+  memset (&s, 0, sizeof (s));
+  re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE | RE_ICASE);
+  re_compile_pattern ("insert into", 11, &r);
+  re_search (&r, "\xFF\0\x12\xA2\xAA\xC4\xB1,K\x12\xC4\xB1*\xACK",
+	     15, 0, 15, &s);
+  return 0;
+}
--- libc/posix/Makefile.jj	2006-08-02 18:43:46.000000000 +0200
+++ libc/posix/Makefile	2006-09-06 18:13:47.000000000 +0200
@@ -81,7 +81,7 @@ tests		:= tstgetopt testfnm runtests run
 		   bug-regex13 bug-regex14 bug-regex15 bug-regex16 \
 		   bug-regex17 bug-regex18 bug-regex19 bug-regex20 \
 		   bug-regex21 bug-regex22 bug-regex23 bug-regex24 \
-		   bug-regex25 tst-nice tst-nanosleep tst-regex2 \
+		   bug-regex25 bug-regex26 tst-nice tst-nanosleep tst-regex2 \
 		   transbug tst-rxspencer tst-pcre tst-boost \
 		   bug-ga1 tst-vfork1 tst-vfork2 tst-waitid \
 		   tst-getaddrinfo2 bug-glob1 bug-glob2 tst-sysconf \
@@ -189,6 +189,7 @@ bug-regex20-ENV = LOCPATH=$(common-objpf
 bug-regex22-ENV = LOCPATH=$(common-objpfx)localedata
 bug-regex23-ENV = LOCPATH=$(common-objpfx)localedata
 bug-regex25-ENV = LOCPATH=$(common-objpfx)localedata
+bug-regex26-ENV = LOCPATH=$(common-objpfx)localedata
 tst-rxspencer-ARGS = --utf8 rxspencer/tests
 tst-rxspencer-ENV = LOCPATH=$(common-objpfx)localedata
 tst-pcre-ARGS = PCRE.tests

	Jakub

^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: [PATCH] Fix re_string_reconstruct if pstr->offsets_needed
  2006-09-06 16:46 [PATCH] Fix re_string_reconstruct if pstr->offsets_needed Jakub Jelinek
@ 2006-09-07 13:50 ` Ulrich Drepper
  0 siblings, 0 replies; 2+ messages in thread
From: Ulrich Drepper @ 2006-09-07 13:50 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Glibc hackers

[-- Attachment #1: Type: text/plain, Size: 101 bytes --]

Applied.

-- 
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 251 bytes --]

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2006-09-07 13:50 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2006-09-06 16:46 [PATCH] Fix re_string_reconstruct if pstr->offsets_needed Jakub Jelinek
2006-09-07 13:50 ` Ulrich Drepper

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).