From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <libc-hacker-return-7012-listarch-libc-hacker=sources.redhat.com@sources.redhat.com>
Received: (qmail 9130 invoked by alias); 25 Nov 2003 13:50:31 -0000
Mailing-List: contact libc-hacker-help@sources.redhat.com; run by ezmlm
Precedence: bulk
List-Subscribe: <mailto:libc-hacker-subscribe@sources.redhat.com>
List-Archive: <http://sources.redhat.com/ml/libc-hacker/>
List-Post: <mailto:libc-hacker@sources.redhat.com>
List-Help: <mailto:libc-hacker-help@sources.redhat.com>, <http://sources.redhat.com/ml/#faqs>
Sender: libc-hacker-owner@sources.redhat.com
Received: (qmail 9094 invoked from network); 25 Nov 2003 13:50:30 -0000
Received: from unknown (HELO sunsite.ms.mff.cuni.cz) (195.113.19.66)
  by sources.redhat.com with SMTP; 25 Nov 2003 13:50:30 -0000
Received: from sunsite.ms.mff.cuni.cz (sunsite.mff.cuni.cz [127.0.0.1])
	by sunsite.ms.mff.cuni.cz (8.12.8/8.12.8) with ESMTP id hAPBjP2c030606;
	Tue, 25 Nov 2003 12:45:25 +0100
Received: (from jakub@localhost)
	by sunsite.ms.mff.cuni.cz (8.12.8/8.12.8/Submit) id hAPBjOlh030602;
	Tue, 25 Nov 2003 12:45:24 +0100
Date: Tue, 25 Nov 2003 19:19:00 -0000
From: Jakub Jelinek <jakub@redhat.com>
To: Ulrich Drepper <drepper@redhat.com>, Roland McGrath <roland@redhat.com>
Cc: Glibc hackers <libc-hacker@sources.redhat.com>
Subject: [PATCH] Fix bug-regex20.c
Message-ID: <20031125114524.GZ12344@sunsite.ms.mff.cuni.cz>
Reply-To: Jakub Jelinek <jakub@redhat.com>
Mime-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
User-Agent: Mutt/1.4i
X-SW-Source: 2003-11/txt/msg00137.txt.bz2

Hi!

re_string_reconstruct relied on bufs_len >= pstr->mb_cur_max.
Also, my UTF-8 re_string_reconstruct optimization unnecessarily didn't
handle idx pointing into the middle of a UTF-8 character and fell back
to the expensive re_string_skip_chars.
With this patch, all glibc regex tests pass even with
LD_PRELOAD=libefence.so.0.

2003-11-25  Jakub Jelinek  <jakub@redhat.com>

	* posix/regex_internal.c (re_string_allocate): Make sure init_len
	is at least dfa->mb_cur_max.
	(re_string_reconstruct): If is_utf8, don't fall back into
	re_string_skip_chars just because idx points into the middle of a
	valid UTF-8 character.  Instead, set the wcs entries which correspond
	to the partial character bytes to WEOF.
	* posix/regexec.c (re_search_internal): Allocate input.bufs_len + 1
	instead of dfa->nodes_len + 1 state_log entries initially.
	* posix/bug-regex20.c (main): Uncomment backwards case insensitive
	tests.

--- libc/posix/bug-regex20.c.jj	2003-11-19 10:24:00.000000000 +0100
+++ libc/posix/bug-regex20.c	2003-11-25 01:37:43.000000000 +0100
@@ -271,7 +271,6 @@ main (void)
 	  continue;
 	}
 
-      /* XXX: This causes regex segfault.  Disable for now.
       res = re_search (&regbuf, tests[i].string, str_len, str_len, -str_len,
 		       NULL);
       if (res != tests[i].res)
@@ -280,7 +279,7 @@ main (void)
 	  ret = 1;
 	  regfree (&regbuf);
 	  continue;
-	}  */
+	}
       regfree (&regbuf);
     }
 
--- libc/posix/regexec.c.jj	2003-11-24 23:49:53.000000000 +0100
+++ libc/posix/regexec.c	2003-11-25 13:06:02.000000000 +0100
@@ -620,7 +620,7 @@ re_search_internal (preg, string, length
      multi character collating element.  */
   if (nmatch > 1 || dfa->has_mb_node)
     {
-      mctx.state_log = re_malloc (re_dfastate_t *, dfa->nodes_len + 1);
+      mctx.state_log = re_malloc (re_dfastate_t *, input.bufs_len + 1);
       if (BE (mctx.state_log == NULL, 0))
 	{
 	  err = REG_ESPACE;
--- libc/posix/regex_internal.c.jj	2003-11-24 09:54:20.000000000 +0100
+++ libc/posix/regex_internal.c	2003-11-25 13:26:45.000000000 +0100
@@ -55,7 +55,12 @@ re_string_allocate (pstr, str, len, init
      const re_dfa_t *dfa;
 {
   reg_errcode_t ret;
-  int init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
+  int init_buf_len;
+
+  /* Ensure at least one character fits into the buffers.  */
+  if (init_len < dfa->mb_cur_max)
+    init_len = dfa->mb_cur_max;
+  init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
   re_string_construct_common (str, len, pstr, trans, icase, dfa);
   pstr->stop = pstr->len;
 
@@ -516,33 +521,33 @@ re_string_reconstruct (pstr, idx, eflags
 		  /* Special case UTF-8.  Multi-byte chars start with any
 		     byte other than 0x80 - 0xbf.  */
 		  raw = pstr->raw_mbs + pstr->raw_mbs_idx;
-		  end = raw + (pstr->valid_len > offset - pstr->mb_cur_max
-			       ? pstr->valid_len : offset - pstr->mb_cur_max);
+		  end = raw + (offset - pstr->mb_cur_max);
 		  for (p = raw + offset - 1; p >= end; --p)
 		    if ((*p & 0xc0) != 0x80)
 		      {
 			mbstate_t cur_state;
 			wchar_t wc2;
+			int mlen;
 
 			/* XXX Don't use mbrtowc, we know which conversion
 			   to use (UTF-8 -> UCS4).  */
 			memset (&cur_state, 0, sizeof (cur_state));
-			if (mbrtowc (&wc2, p, raw + offset - p, &cur_state)
-			    == raw + offset - p)
+			mlen = mbrtowc (&wc2, p, raw + pstr->len - p,
+					&cur_state) - (raw + offset - p);
+			if (mlen >= 0)
 			  {
 			    memset (&pstr->cur_state, '\0',
 				    sizeof (mbstate_t));
+			    pstr->valid_len = mlen;
 			    wc = wc2;
 			  }
 			break;
 		      }
 		}
 	      if (wc == WEOF)
-		{
-		  pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
-		  for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
-		    pstr->wcs[wcs_idx] = WEOF;
-		}
+		pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
+	      for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
+		pstr->wcs[wcs_idx] = WEOF;
 	      if (pstr->trans && wc <= 0xff)
 		wc = pstr->trans[wc];
 	      pstr->tip_context = (IS_WIDE_WORD_CHAR (wc) ? CONTEXT_WORD

	Jakub