* [PATCH] Speed up regex UTF-8 handling (take 2)
@ 2003-11-12 13:47 Jakub Jelinek
0 siblings, 0 replies; only message in thread
From: Jakub Jelinek @ 2003-11-12 13:47 UTC (permalink / raw)
To: Ulrich Drepper, Roland McGrath; +Cc: Glibc hackers
Hi!
It seems there are no interesting charsets with a property similar to UTF-8's
for backward searches, so this patch (which relies on the infrastructure
patch I've just posted) simply special-cases UTF-8 instead of adding hooks.
2003-11-12 Jakub Jelinek <jakub@redhat.com>
* posix/tst-regex.c (umemlen): New variable.
(test_expr): Add expectedicase argument. Test case insensitive
searches as well as backwards searches (case sensitive and
insensitive) too.
(run_test): Add icase argument. Use it to compute regcomp flags.
(run_test_backwards): New function.
(main): Cast read to size_t to avoid warning. Set umemlen.
Add expectedicase arguments to test_expr.
* posix/regex_internal.c (re_string_reconstruct): If is_utf8,
find previous character by walking back instead of converting
all chars from beginning.
--- libc/posix/regex_internal.c.jj 2003-11-12 08:26:45.000000000 +0100
+++ libc/posix/regex_internal.c 2003-11-12 09:48:11.000000000 +0100
@@ -438,10 +438,40 @@ re_string_reconstruct (pstr, idx, eflags
if (pstr->mb_cur_max > 1)
{
int wcs_idx;
- wint_t wc;
- pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
- for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
- pstr->wcs[wcs_idx] = WEOF;
+ wint_t wc = WEOF;
+
+ if (pstr->is_utf8)
+ {
+ const unsigned char *raw, *p, *end;
+
+ /* Special case UTF-8. Multi-byte chars start with any
+ byte other than 0x80 - 0xbf. */
+ raw = pstr->raw_mbs + pstr->raw_mbs_idx;
+ end = raw + (pstr->valid_len > offset - pstr->mb_cur_max
+ ? pstr->valid_len : offset - pstr->mb_cur_max);
+ for (p = raw + offset - 1; p >= end; --p)
+ if ((*p & 0xc0) != 0x80)
+ {
+ mbstate_t cur_state;
+ wchar_t wc2;
+
+ memset (&cur_state, 0, sizeof (cur_state));
+ if (mbrtowc (&wc2, p, raw + offset - p, &cur_state)
+ == raw + offset - p)
+ {
+ memset (&pstr->cur_state, '\0',
+ sizeof (mbstate_t));
+ wc = wc2;
+ }
+ break;
+ }
+ }
+ if (wc == WEOF)
+ {
+ pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
+ for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
+ pstr->wcs[wcs_idx] = WEOF;
+ }
if (pstr->trans && wc <= 0xff)
wc = pstr->trans[wc];
pstr->tip_context = (IS_WIDE_WORD_CHAR (wc) ? CONTEXT_WORD
--- libc/posix/tst-regex.c.jj 2001-07-06 06:55:38.000000000 +0200
+++ libc/posix/tst-regex.c 2003-11-12 09:32:11.000000000 +0100
@@ -1,4 +1,4 @@
-/* Copyright (C) 2001 Free Software Foundation, Inc.
+/* Copyright (C) 2001, 2003 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -44,10 +44,13 @@ static iconv_t cd;
static char *mem;
static char *umem;
static size_t memlen;
+static size_t umemlen;
-static int test_expr (const char *expr, int expected);
+static int test_expr (const char *expr, int expected, int expectedicase);
static int run_test (const char *expr, const char *mem, size_t memlen,
- int expected);
+ int icase, int expected);
+static int run_test_backwards (const char *expr, const char *mem,
+ size_t memlen, int icase, int expected);
int
@@ -78,7 +81,7 @@ main (void)
if (mem == NULL)
error (EXIT_FAILURE, errno, "while allocating buffer");
- if (read (fd, mem, memlen) != memlen)
+ if ((size_t) read (fd, mem, memlen) != memlen)
error (EXIT_FAILURE, 0, "cannot read entire file");
mem[memlen] = '\0';
@@ -102,6 +105,7 @@ main (void)
outmem = umem;
outlen = 2 * memlen - 1;
iconv (cd, &inmem, &inlen, &outmem, &outlen);
+ umemlen = outmem - umem;
if (inlen != 0)
error (EXIT_FAILURE, errno, "cannot convert buffer");
@@ -116,11 +120,11 @@ main (void)
/* Run the actual tests. All tests are run in a single-byte and a
multi-byte locale. */
- result = test_expr ("[äáàâéèêíìîñöóòôüúùû]", 2);
- result |= test_expr ("G.ran", 2);
- result |= test_expr ("G.\\{1\\}ran", 2);
- result |= test_expr ("G.*ran", 3);
- result |= test_expr ("[äáàâ]", 0);
+ result = test_expr ("[äáàâéèêíìîñöóòôüúùû]", 2, 2);
+ result |= test_expr ("G.ran", 2, 3);
+ result |= test_expr ("G.\\{1\\}ran", 2, 3);
+ result |= test_expr ("G.*ran", 3, 44);
+ result |= test_expr ("[äáàâ]", 0, 0);
/* Free the resources. */
free (umem);
@@ -132,7 +136,7 @@ main (void)
static int
-test_expr (const char *expr, int expected)
+test_expr (const char *expr, int expected, int expectedicase)
{
int result;
char *inmem;
@@ -146,7 +150,14 @@ test_expr (const char *expr, int expecte
error (EXIT_FAILURE, 0, "cannot set locale de_DE.ISO-8859-1");
printf ("\nTest \"%s\" with 8-bit locale\n", expr);
- result = run_test (expr, mem, memlen, expected);
+ result = run_test (expr, mem, memlen, 0, expected);
+ printf ("\nTest \"%s\" with 8-bit locale, case insensitive\n", expr);
+ result |= run_test (expr, mem, memlen, 1, expectedicase);
+ printf ("\nTest \"%s\" backwards with 8-bit locale\n", expr);
+ result |= run_test_backwards (expr, mem, memlen, 0, expected);
+ printf ("\nTest \"%s\" backwards with 8-bit locale, case insensitive\n",
+ expr);
+ result |= run_test_backwards (expr, mem, memlen, 1, expectedicase);
/* Second test: search with an UTF-8 locale. */
if (setlocale (LC_ALL, "de_DE.UTF-8") == NULL)
@@ -163,14 +174,22 @@ test_expr (const char *expr, int expecte
/* Run the tests. */
printf ("\nTest \"%s\" with multi-byte locale\n", expr);
- result |= run_test (uexpr, umem, 2 * memlen - outlen, expected);
+ result |= run_test (uexpr, umem, umemlen, 0, expected);
+ printf ("\nTest \"%s\" with multi-byte locale, case insensitive\n", expr);
+ result |= run_test (uexpr, umem, umemlen, 1, expectedicase);
+ printf ("\nTest \"%s\" backwards with multi-byte locale\n", expr);
+ result |= run_test_backwards (uexpr, umem, umemlen, 0, expected);
+ printf ("\nTest \"%s\" backwards with multi-byte locale, case insensitive\n",
+ expr);
+ result |= run_test_backwards (uexpr, umem, umemlen, 1, expectedicase);
return result;
}
static int
-run_test (const char *expr, const char *mem, size_t memlen, int expected)
+run_test (const char *expr, const char *mem, size_t memlen, int icase,
+ int expected)
{
#ifdef _POSIX_CPUTIME
struct timespec start;
@@ -186,7 +205,7 @@ run_test (const char *expr, const char *
use_clock = clock_gettime (cl, &start) == 0;
#endif
- err = regcomp (&re, expr, REG_NEWLINE);
+ err = regcomp (&re, expr, REG_NEWLINE | (icase ? REG_ICASE : 0));
if (err != REG_NOERROR)
{
char buf[200];
@@ -257,3 +276,97 @@ run_test (const char *expr, const char *
expect. */
return cnt != expected;
}
+
+
+static int
+run_test_backwards (const char *expr, const char *mem, size_t memlen,
+ int icase, int expected)
+{
+#ifdef _POSIX_CPUTIME
+ struct timespec start;
+ struct timespec finish;
+#endif
+ struct re_pattern_buffer re;
+ const char *err;
+ size_t offset;
+ int cnt;
+
+#ifdef _POSIX_CPUTIME
+ if (use_clock)
+ use_clock = clock_gettime (cl, &start) == 0;
+#endif
+
+ re_set_syntax ((RE_SYNTAX_POSIX_BASIC & ~RE_DOT_NEWLINE)
+ | RE_HAT_LISTS_NOT_NEWLINE
+ | (icase ? RE_ICASE : 0));
+
+ memset (&re, 0, sizeof (re));
+ re.fastmap = malloc (256);
+ if (re.fastmap == NULL)
+ error (EXIT_FAILURE, errno, "cannot allocate fastmap");
+
+ err = re_compile_pattern (expr, strlen (expr), &re);
+ if (err != NULL)
+ error (EXIT_FAILURE, 0, "cannot compile expression: %s", err);
+
+ if (re_compile_fastmap (&re))
+ error (EXIT_FAILURE, 0, "couldn't compile fastmap");
+
+ cnt = 0;
+ offset = memlen;
+ assert (mem[memlen] == '\0');
+ while (offset <= memlen)
+ {
+ int start;
+ const char *sp;
+ const char *ep;
+
+ start = re_search (&re, mem, memlen, offset, -offset, NULL);
+ if (start == -1)
+ break;
+
+ if (start == -2)
+ error (EXIT_FAILURE, 0, "internal error in re_search");
+
+ sp = mem + start;
+ while (sp > mem && sp[-1] != '\n')
+ --sp;
+
+ ep = mem + start;
+ while (*ep != '\0' && *ep != '\n')
+ ++ep;
+
+ printf ("match %d: \"%.*s\"\n", ++cnt, (int) (ep - sp), sp);
+
+ offset = sp - 1 - mem;
+ }
+
+ regfree (&re);
+
+#ifdef _POSIX_CPUTIME
+ if (use_clock)
+ {
+ use_clock = clock_gettime (cl, &finish) == 0;
+ if (use_clock)
+ {
+ if (finish.tv_nsec < start.tv_nsec)
+ {
+ finish.tv_nsec -= start.tv_nsec - 1000000000;
+ finish.tv_sec -= 1 + start.tv_sec;
+ }
+ else
+ {
+ finish.tv_nsec -= start.tv_nsec;
+ finish.tv_sec -= start.tv_sec;
+ }
+
+ printf ("elapsed time: %ld.%09ld sec\n",
+ finish.tv_sec, finish.tv_nsec);
+ }
+ }
+#endif
+
+ /* Return an error if the number of matches found is not what we
+ expect. */
+ return cnt != expected;
+}
Jakub
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2003-11-12 13:47 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2003-11-12 13:47 [PATCH] Speed up regex UTF-8 handling (take 2) Jakub Jelinek
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).