public inbox for libc-hacker@sourceware.org
 help / color / mirror / Atom feed
* [PATCH] Fix re_search with multibyte locales other than UTF-8
@ 2006-06-02 14:39 Jakub Jelinek
  2006-06-04  4:59 ` Ulrich Drepper
  0 siblings, 1 reply; 2+ messages in thread
From: Jakub Jelinek @ 2006-06-02 14:39 UTC (permalink / raw)
  To: Ulrich Drepper; +Cc: Glibc hackers

Hi!

As the attached testcase shows, we have some issues with the handling of
multibyte charsets other than UTF-8.  One bug (the one that causes the
failure) is that re_string_reconstruct clears valid_raw_len, but
re_string_skip_chars (whose only caller is re_string_reconstruct) then
uses it.  This means that we can try to convert bytes starting in the
middle of a multibyte character and return a non-zero valid_len even when
a multibyte character starts at idx.  Another problem is that if mbrtowc
failed, wc would be undefined.  Also, if idx is only slightly past the end
of the current valid raw string, there might not be any complete characters
for re_string_skip_chars to skip over, and in that case we want to use the
context from the end of the previous valid buffer.

2006-06-02  Jakub Jelinek  <jakub@redhat.com>

	* posix/regex_internal.c (re_string_skip_chars): If no character has been
	converted at all, set *last_wc to WEOF.  If mbrtowc failed, set wc to the
	byte which couldn't be converted.
	(re_string_reconstruct): Don't clear valid_raw_len before calling
	re_string_skip_chars.  If wc is WEOF after re_string_skip_chars, set
	tip_context using re_string_context_at.
	* posix/Makefile: Add rules to build and run bug-regex25 test.
	* posix/bug-regex25.c: New test.

--- libc/posix/regex_internal.c.jj	2006-06-02 16:19:33.000000000 +0200
+++ libc/posix/regex_internal.c	2006-06-02 16:19:42.000000000 +0200
@@ -482,7 +482,7 @@ re_string_skip_chars (re_string_t *pstr,
   mbstate_t prev_st;
   int rawbuf_idx;
   size_t mbclen;
-  wchar_t wc = 0;
+  wchar_t wc = WEOF;
 
   /* Skip the characters which are not necessary to check.  */
   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
@@ -495,7 +495,11 @@ re_string_skip_chars (re_string_t *pstr,
 			remain_len, &pstr->cur_state);
       if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
 	{
-	  /* We treat these cases as a singlebyte character.  */
+	  /* We treat these cases as a single byte character.  */
+	  if (mbclen == 0 || remain_len == 0)
+	    wc = L'\0';
+	  else
+	    wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
 	  mbclen = 1;
 	  pstr->cur_state = prev_st;
 	}
@@ -618,7 +622,6 @@ re_string_reconstruct (re_string_t *pstr
 	    }
 #endif
 	  pstr->valid_len = 0;
-	  pstr->valid_raw_len = 0;
 #ifdef RE_ENABLE_I18N
 	  if (pstr->mb_cur_max > 1)
 	    {
@@ -681,6 +684,16 @@ re_string_reconstruct (re_string_t *pstr
 
 	      if (wc == WEOF)
 		pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
+	      if (wc == WEOF)
+		pstr->tip_context
+		  = re_string_context_at (pstr, pstr->valid_raw_len - 1, eflags);
+	      else
+		pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
+				      && IS_WIDE_WORD_CHAR (wc))
+				     ? CONTEXT_WORD
+				     : ((IS_WIDE_NEWLINE (wc)
+					 && pstr->newline_anchor)
+					? CONTEXT_NEWLINE : 0));
 	      if (BE (pstr->valid_len, 0))
 		{
 		  for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
@@ -689,17 +702,12 @@ re_string_reconstruct (re_string_t *pstr
 		    memset (pstr->mbs, 255, pstr->valid_len);
 		}
 	      pstr->valid_raw_len = pstr->valid_len;
-	      pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
-				    && IS_WIDE_WORD_CHAR (wc))
-				   ? CONTEXT_WORD
-				   : ((IS_WIDE_NEWLINE (wc)
-				       && pstr->newline_anchor)
-				      ? CONTEXT_NEWLINE : 0));
 	    }
 	  else
 #endif /* RE_ENABLE_I18N */
 	    {
 	      int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
+	      pstr->valid_raw_len = 0;
 	      if (pstr->trans)
 		c = pstr->trans[c];
 	      pstr->tip_context = (bitset_contain (pstr->word_char, c)
--- libc/posix/Makefile.jj	2006-05-03 21:38:02.000000000 +0200
+++ libc/posix/Makefile	2006-06-02 16:20:27.000000000 +0200
@@ -81,7 +81,7 @@ tests		:= tstgetopt testfnm runtests run
 		   bug-regex13 bug-regex14 bug-regex15 bug-regex16 \
 		   bug-regex17 bug-regex18 bug-regex19 bug-regex20 \
 		   bug-regex21 bug-regex22 bug-regex23 bug-regex24 \
-		   tst-nice tst-nanosleep tst-regex2 \
+		   bug-regex25 tst-nice tst-nanosleep tst-regex2 \
 		   transbug tst-rxspencer tst-pcre tst-boost \
 		   bug-ga1 tst-vfork1 tst-vfork2 tst-waitid \
 		   tst-getaddrinfo2 bug-glob1 bug-glob2 tst-sysconf \
@@ -188,6 +188,7 @@ bug-regex19-ENV = LOCPATH=$(common-objpf
 bug-regex20-ENV = LOCPATH=$(common-objpfx)localedata
 bug-regex22-ENV = LOCPATH=$(common-objpfx)localedata
 bug-regex23-ENV = LOCPATH=$(common-objpfx)localedata
+bug-regex25-ENV = LOCPATH=$(common-objpfx)localedata
 tst-rxspencer-ARGS = --utf8 rxspencer/tests
 tst-rxspencer-ENV = LOCPATH=$(common-objpfx)localedata
 tst-pcre-ARGS = PCRE.tests
--- libc/posix/bug-regex25.c.jj	2006-06-02 16:14:35.000000000 +0200
+++ libc/posix/bug-regex25.c	2006-06-02 16:18:49.000000000 +0200
@@ -0,0 +1,57 @@
+/* Test re_search in multibyte locale other than UTF-8.
+   Copyright (C) 2006 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Jakub Jelinek <jakub@redhat.com>, 2006.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <locale.h>
+#include <regex.h>
+#include <stdio.h>
+#include <string.h>
+
+const char *str1 = "\xa3\xd8\xa3\xc9\xa3\xc9";
+const char *str2 = "\xa3\xd8\xa3\xc9";
+
+int
+main (void)
+{
+  setlocale (LC_ALL, "ja_JP.eucJP");
+
+  re_set_syntax (RE_SYNTAX_SED);
+
+  struct re_pattern_buffer re;
+  memset (&re, 0, sizeof (re));
+
+  struct re_registers regs;
+  memset (&regs, 0, sizeof (regs));
+
+  re_compile_pattern ("$", 1, &re);
+
+  int ret = 0, r = re_search (&re, str1, 4, 0, 4, &regs);
+  if (r != 4)
+    {
+      printf ("First re_search returned %d\n", r);
+      ret = 1;
+    }
+  r = re_search (&re, str2, 4, 0, 4, &regs);
+  if (r != 4)
+    {
+      printf ("Second re_search returned %d\n", r);
+      ret = 1;
+    }
+  return ret;
+}

	Jakub

^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: [PATCH] Fix re_search with multibyte locales other than UTF-8
  2006-06-02 14:39 [PATCH] Fix re_search with multibyte locales other than UTF-8 Jakub Jelinek
@ 2006-06-04  4:59 ` Ulrich Drepper
  0 siblings, 0 replies; 2+ messages in thread
From: Ulrich Drepper @ 2006-06-04  4:59 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Glibc hackers

[-- Attachment #1: Type: text/plain, Size: 101 bytes --]

Applied.

-- 
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 251 bytes --]

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2006-06-04  4:59 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2006-06-02 14:39 [PATCH] Fix re_search with multibyte locales other than UTF-8 Jakub Jelinek
2006-06-04  4:59 ` Ulrich Drepper

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).