* RFC: regex init_string
@ 2003-11-15 0:04 Jakub Jelinek
0 siblings, 0 replies; only message in thread
From: Jakub Jelinek @ 2003-11-15 0:04 UTC (permalink / raw)
To: Ulrich Drepper; +Cc: Glibc hackers
Hi!
Many regular expressions start with quite a long initial string.
This patch computes it (e.g. for "a(bc|bcd)" the initial string is "abc")
and tries to optimize it (if the initial string is single char, then
using memchr, otherwise memmem (which doesn't seem to be optimized too
much)). I haven't done any serious benchmarking and until this is
done and the speedups are on average worth the time spent in computing
the string, it doesn't make sense to apply this.
Maybe it might be useful to add Boyer-Moore if the initial string is longer,
only testing will show.
2003-11-15 Jakub Jelinek <jakub@redhat.com>
* posix/regex_internal.h (struct re_dfa_t): Add init_string
and init_string_len fields.
* posix/regcomp.c (re_compile_fastmap): Free and clear init_string
first or if it doesn't contain any chars afterwards, initialize
init_string_len.
(re_compile_fastmap_iter): Compute init_string.
(free_dfa_content): Free init_string.
* posix/regexec.c (re_search_internal): Optimize using init_string.
* posix/bug-regex21.c: New test.
* posix/tst-regex.c: Add test without special characters.
* posix/Makefile (tests): Add bug-regex21.
(bug-regex21-ENV): New.
--- libc/posix/regex_internal.h.jj 2003-11-12 21:33:35.000000000 +0100
+++ libc/posix/regex_internal.h 2003-11-14 23:21:21.000000000 +0100
@@ -611,6 +611,9 @@ struct re_dfa_t
unsigned int has_mb_node : 1;
unsigned int is_utf8 : 1;
int mb_cur_max;
+ /* Starting string (computed together with fastmap). */
+ unsigned char *init_string;
+ size_t init_string_len;
};
typedef struct re_dfa_t re_dfa_t;
--- libc/posix/regcomp.c.jj 2003-11-13 22:43:39.000000000 +0100
+++ libc/posix/regcomp.c 2003-11-15 00:44:05.000000000 +0100
@@ -281,6 +281,8 @@ re_compile_fastmap (bufp)
re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
char *fastmap = bufp->fastmap;
+ re_free (dfa->init_string);
+ dfa->init_string = NULL;
memset (fastmap, '\0', sizeof (char) * SBC_MAX);
re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
if (dfa->init_state != dfa->init_state_word)
@@ -289,6 +291,16 @@ re_compile_fastmap (bufp)
re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
if (dfa->init_state != dfa->init_state_begbuf)
re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
+ if (dfa->init_string)
+ {
+ if (fastmap[0] || !dfa->init_string[0])
+ {
+ re_free (dfa->init_string);
+ dfa->init_string = NULL;
+ }
+ else
+ dfa->init_string_len = strlen (dfa->init_string);
+ }
bufp->fastmap_accurate = 1;
return 0;
}
@@ -335,6 +347,7 @@ re_compile_fastmap_iter (bufp, init_stat
p = buf;
*p++ = dfa->nodes[node].opr.c;
while (++node < dfa->nodes_len
+ && p < buf + dfa->mb_cur_max
&& dfa->nodes[node].type == CHARACTER
&& dfa->nodes[node].mb_partial)
*p++ = dfa->nodes[node].opr.c;
@@ -345,6 +358,86 @@ re_compile_fastmap_iter (bufp, init_stat
re_set_fastmap (fastmap, 0, buf[0]);
}
#endif
+ if ((dfa->mb_cur_max == 1 || dfa->is_utf8)
+ && !(bufp->syntax & RE_ICASE))
+ {
+ if (dfa->init_string == NULL)
+ {
+ size_t len = 2;
+ unsigned char *p;
+ int tnode;
+
+ tnode = node;
+ while ((tnode = dfa->nexts[tnode]) != -1)
+ {
+ switch (dfa->nodes[tnode].type)
+ {
+ case CHARACTER:
+ ++len;
+ continue;
+ case OP_OPEN_SUBEXP:
+ case OP_CLOSE_SUBEXP:
+ continue;
+ default:
+ break;
+ }
+ break;
+ }
+
+ p = re_malloc (char, len);
+ if (p == NULL)
+ /* Signal to parent that dfa->init_string will not
+ be valid. */
+ fastmap[0] = 1;
+ else
+ {
+ dfa->init_string = p;
+ for (tnode = node; tnode != -1;
+ tnode = dfa->nexts[tnode])
+ {
+ switch (dfa->nodes[tnode].type)
+ {
+ case CHARACTER:
+ *p++ = dfa->nodes[tnode].opr.c;
+ continue;
+ case OP_OPEN_SUBEXP:
+ case OP_CLOSE_SUBEXP:
+ continue;
+ default:
+ break;
+ }
+ break;
+ }
+ *p = '\0';
+ }
+ }
+ else
+ {
+ unsigned char *p;
+ int tnode;
+
+ for (tnode = node, p = dfa->init_string;
+ tnode != -1;
+ tnode = dfa->nexts[tnode])
+ {
+ switch (dfa->nodes[tnode].type)
+ {
+ case CHARACTER:
+ if (*p != dfa->nodes[tnode].opr.c)
+ break;
+ p++;
+ continue;
+ case OP_OPEN_SUBEXP:
+ case OP_CLOSE_SUBEXP:
+ continue;
+ default:
+ break;
+ }
+ *p = '\0';
+ break;
+ }
+ }
+ }
}
else if (type == SIMPLE_BRACKET)
{
@@ -353,6 +446,8 @@ re_compile_fastmap_iter (bufp, init_stat
for (j = 0; j < UINT_BITS; ++j, ++ch)
if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
re_set_fastmap (fastmap, icase, ch);
+ if (dfa->init_string)
+ *dfa->init_string = '\0';
}
#ifdef RE_ENABLE_I18N
else if (type == COMPLEX_BRACKET)
@@ -399,6 +494,8 @@ re_compile_fastmap_iter (bufp, init_stat
re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
}
}
+ if (dfa->init_string)
+ *dfa->init_string = '\0';
}
#endif /* RE_ENABLE_I18N */
else if (type == END_OF_RE || type == OP_PERIOD)
@@ -406,6 +503,8 @@ re_compile_fastmap_iter (bufp, init_stat
memset (fastmap, '\1', sizeof (char) * SBC_MAX);
if (type == END_OF_RE)
bufp->can_be_null = 1;
+ if (dfa->init_string)
+ *dfa->init_string = '\0';
return;
}
}
@@ -604,6 +703,7 @@ free_dfa_content (re_dfa_t *dfa)
#ifdef DEBUG
re_free (dfa->re_str);
#endif
+ re_free (dfa->init_string);
re_free (dfa);
}
--- libc/posix/regexec.c.jj 2003-11-12 21:33:35.000000000 +0100
+++ libc/posix/regexec.c 2003-11-14 23:53:43.000000000 +0100
@@ -584,6 +584,7 @@ re_search_internal (preg, string, length
{
reg_errcode_t err;
re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
+ const unsigned char *init_string = NULL;
re_string_t input;
int left_lim, right_lim, incr;
int fl_longest_match, match_first, match_last = -1;
@@ -646,11 +647,42 @@ re_search_internal (preg, string, length
right_lim = (range < 0) ? start : start + range;
sb = dfa->mb_cur_max == 1;
fast_translate = sb || !(preg->syntax & RE_ICASE || preg->translate);
+ if (range > 0 && fastmap && !preg->translate)
+ init_string = dfa->init_string;
for (;;)
{
/* At first get the current byte from input string. */
- if (fastmap)
+ if (init_string)
+ {
+ const char *p;
+
+ if (!init_string[1])
+ p = memchr (string + match_first, init_string[0],
+ right_lim - match_first);
+ else
+ {
+ int mlim = right_lim + dfa->init_string_len;
+
+ if (mlim > length)
+ mlim = length;
+ p = memmem (string + match_first,
+ mlim - match_first,
+ init_string, dfa->init_string_len);
+ }
+
+ if (BE (p != NULL, 1))
+ match_first = p - string;
+ else
+ {
+ match_first = right_lim;
+ if (match_first >= length)
+ break;
+ if ((unsigned char) string[match_first] != init_string[0])
+ break;
+ }
+ }
+ else if (fastmap)
{
if (BE (fast_translate, 1))
{
--- libc/posix/bug-regex21.c.jj 2003-11-14 22:39:28.000000000 +0100
+++ libc/posix/bug-regex21.c 2003-11-15 00:35:15.000000000 +0100
@@ -0,0 +1,238 @@
+/* Test for init_string regular expression optimizations.
+ Copyright (C) 2003 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Jakub Jelinek <jakub@redhat.com>, 2003.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sys/types.h>
+#include <mcheck.h>
+#include <regex.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <locale.h>
+
+#define RE_NO_INTERNAL_PROTOTYPES 1
+#include "regex_internal.h"
+
+#define BRE RE_SYNTAX_POSIX_BASIC
+#define ERE RE_SYNTAX_POSIX_EXTENDED
+
+static struct
+{
+ int syntax;
+ const char *pattern;
+ const char *string;
+ int res;
+ const char *init_string;
+} tests[] = {
+ /* \xc3\x84 LATIN CAPITAL LETTER A WITH DIAERESIS
+ \xc3\x96 LATIN CAPITAL LETTER O WITH DIAERESIS
+ \xc3\xa4 LATIN SMALL LETTER A WITH DIAERESIS
+ \xc3\xb6 LATIN SMALL LETTER O WITH DIAERESIS
+ \xe2\x80\x94 EM DASH */
+ {BRE, "foo", "b\xc3\xa4rfoob\xc3\xa4z", 4, "foo"},
+ {BRE, "f*oo", "b\xc3\xa4rfoob\xc3\xa4z", 4, NULL},
+ {BRE, "f\\(o\\)o", "b\xc3\xa4rfoob\xc3\xa4z", 4, "foo"},
+ {BRE, "f\\(o\\)\\+o", "b\xc3\xa4rfoob\xc3\xa4z", 4, "fo"},
+ {BRE, "f\\(o\\)\\?o", "b\xc3\xa4rfoob\xc3\xa4z", 4, "f"},
+ {BRE, "f\\(o\\)*o", "b\xc3\xa4rfoob\xc3\xa4z", 4, "f"},
+ {BRE, "fo*b", "b\xc3\xa4rfoob\xc3\xa4z", 4, "f"},
+ {BRE, "fo\\+b", "b\xc3\xa4rfoob\xc3\xa4z", 4, "fo"},
+ {BRE, "fo\\?b", "b\xc3\xa4rfb\xc3\xa4z", 4, "f"},
+ {BRE, "^b\xc3\xa4r", "b\xc3\xa4rfoob\xc3\xa4z", 0, "b\xc3\xa4r"},
+ {BRE, "\\(fo\\)o", "b\xc3\xa4rfoob\xc3\xa4z", 4, "foo"},
+ {BRE, "f\\(oo\\|bar\\)", "b\xc3\xa4rfoob\xc3\xa4z", 4, "f"},
+ {BRE, "\\(fo\\|ba\\)", "b\xc3\xa4rfoob\xc3\xa4z", 4, NULL},
+ {BRE, "^x\\|xy*z$", "\xc3\xb6xyyz", 2, "x"},
+ {BRE, "^x\\\\y\\{6\\}z\\+", "x\\yyyyyyzz\xc3\xb6", 0, "x\\yyyyyyz"},
+ {BRE, "^x\\\\y\\{2,36\\}z\\+", "x\\yzz\xc3\xb6", -1, "x\\yy"},
+ {BRE, "^x\\\\y\\{,3\\}z\\+", "x\\yyyzz\xc3\xb6", 0, "x\\"},
+ {BRE, "x[C]y", "axCy", 1, "x"},
+ {BRE, "x[ABC]y", "axCy", 1, "x"},
+ {BRE, "\\`x\\|z\\'", "x\xe2\x80\x94", 0, NULL},
+ {BRE, "\\(xy\\)z\\1a\\1", "\xe2\x80\x94xyzxyaxy\xc3\x84", 3, "xyz"},
+ {BRE, "xy\\?z", "\xc3\x84xz\xc3\xb6", 2, "x"},
+ {BRE, "x.y", "ax\xe2\x80\x94yz", 1, "x"},
+ {BRE, "x\xc3\x96*y", "ax\xc3\x96\xc3\x96yz", 1, "x"},
+ {BRE, "x[\xc3\x84\xc3\xa4]y", "ax\xc3\xa4y", 1, "x"},
+ {BRE, "x[A-Z,]y", "axCy", 1, "x"},
+ {BRE, "x[^y]z", "ax\xe2\x80\x94z", 1, "x"},
+ {BRE, "x[[:alnum:]]z", "ax\xc3\x96z", 1, "x"},
+ {BRE, "x[[=A=]]z", "axAz", 1, "x"},
+ {BRE, "x[[=\xc3\x84=]]z", "ax\xc3\x84z", 1, "x"},
+ {BRE, "\\<g", "\xe2\x80\x94g", 3, "g"},
+ {BRE, "\\bg\\b", "\xe2\x80\x94g", 3, "g"},
+ {BRE, "\\Bg\\B", "\xc3\xa4g\xc3\xa4", 2, "g"},
+ {BRE, "a\\wz", "a\xc3\x84z", 0, "a"},
+ {BRE, "x\\Wz", "\xc3\x96x\xe2\x80\x94z", 2, "x"},
+ {ERE, "(ab|ab*)bc", "abc", 0, "a"},
+ {ERE, "foo", "b\xc3\xa4rfoob\xc3\xa4z", 4, "foo"},
+ {ERE, "f*oo", "b\xc3\xa4rfoob\xc3\xa4z", 4, NULL},
+ {ERE, "f(o)o", "b\xc3\xa4rfoob\xc3\xa4z", 4, "foo"},
+ {ERE, "f(o)+o", "b\xc3\xa4rfoob\xc3\xa4z", 4, "fo"},
+ {ERE, "f(o)?o", "b\xc3\xa4rfoob\xc3\xa4z", 4, "f"},
+ {ERE, "f(o)*o", "b\xc3\xa4rfoob\xc3\xa4z", 4, "f"},
+ {ERE, "fo*b", "b\xc3\xa4rfoob\xc3\xa4z", 4, "f"},
+ {ERE, "fo+b", "b\xc3\xa4rfoob\xc3\xa4z", 4, "fo"},
+ {ERE, "fo?b", "b\xc3\xa4rfb\xc3\xa4z", 4, "f"},
+ {ERE, "^b\xc3\xa4r", "b\xc3\xa4rfoob\xc3\xa4z", 0, "b\xc3\xa4r"},
+ {ERE, "(fo)o", "b\xc3\xa4rfoob\xc3\xa4z", 4, "foo"},
+ {ERE, "f(oo|bar)", "b\xc3\xa4rfoob\xc3\xa4z", 4, "f"},
+ {ERE, "(fo|ba)", "b\xc3\xa4rfoob\xc3\xa4z", 4, NULL},
+ {ERE, "^x|xy*z$", "\xc3\xb6xyyz", 2, "x"},
+ {ERE, "^x\\\\y{6}z+", "x\\yyyyyyzz\xc3\xb6", 0, "x\\yyyyyyz"},
+ {ERE, "^x\\\\y{2,36}z+", "x\\yzz\xc3\xb6", -1, "x\\yy"},
+ {ERE, "^x\\\\y{,3}z+", "x\\yyyzz\xc3\xb6", 0, "x\\"},
+ {ERE, "x[C]y", "axCy", 1, "x"},
+ {ERE, "x[ABC]y", "axCy", 1, "x"},
+ {ERE, "\\`x|z\\'", "x\xe2\x80\x94", 0, NULL},
+ {ERE, "(xy)z\\1a\\1", "\xe2\x80\x94xyzxyaxy\xc3\x84", 3, "xyz"},
+ {ERE, "xy?z", "\xc3\x84xz\xc3\xb6", 2, "x"},
+ {ERE, "x.y", "ax\xe2\x80\x94yz", 1, "x"},
+ {ERE, "x\xc3\x96*y", "ax\xc3\x96\xc3\x96yz", 1, "x"},
+ {ERE, "x[\xc3\x84\xc3\xa4]y", "ax\xc3\xa4y", 1, "x"},
+ {ERE, "x[A-Z,]y", "axCy", 1, "x"},
+ {ERE, "x[^y]z", "ax\xe2\x80\x94z", 1, "x"},
+ {ERE, "x[[:alnum:]]z", "ax\xc3\x96z", 1, "x"},
+ {ERE, "x[[=A=]]z", "axAz", 1, "x"},
+ {ERE, "x[[=\xc3\x84=]]z", "ax\xc3\x84z", 1, "x"},
+ {ERE, "\\<g", "\xe2\x80\x94g", 3, "g"},
+ {ERE, "\\bg\\b", "\xe2\x80\x94g", 3, "g"},
+ {ERE, "\\Bg\\B", "\xc3\xa4g\xc3\xa4", 2, "g"},
+ {ERE, "a\\wz", "a\xc3\x84z", 0, "a"},
+ {ERE, "x\\Wz", "\xc3\x96x\xe2\x80\x94z", 2, "x"}
+};
+
+int
+main (void)
+{
+ struct re_pattern_buffer regbuf;
+ const char *err;
+ size_t i;
+ int ret = 0;
+
+ mtrace ();
+
+ setlocale (LC_ALL, "de_DE.UTF-8");
+ for (i = 0; i < sizeof (tests) / sizeof (tests[0]); ++i)
+ {
+ int res;
+
+ re_set_syntax (tests[i].syntax);
+ memset (®buf, '\0', sizeof (regbuf));
+ err = re_compile_pattern (tests[i].pattern, strlen (tests[i].pattern),
+ ®buf);
+ if (err != NULL)
+ {
+ printf ("re_compile_pattern failed: %s\n", err);
+ ret = 1;
+ continue;
+ }
+
+ if (((re_dfa_t *) regbuf.buffer)->init_string != NULL)
+ {
+ printf ("init_string != NULL in test %zd\n", i);
+ regfree (®buf);
+ ret = 1;
+ continue;
+ }
+
+ regbuf.fastmap = malloc (256);
+ re_compile_fastmap (®buf);
+ if (tests[i].init_string == NULL)
+ {
+ if (((re_dfa_t *) regbuf.buffer)->init_string != NULL)
+ {
+ printf ("init_string \"%s\" in test %zd, expected NULL\n",
+ ((re_dfa_t *) regbuf.buffer)->init_string, i);
+ ret = 1;
+ }
+ }
+ else if (((re_dfa_t *) regbuf.buffer)->init_string == NULL)
+ {
+ printf ("init_string NULL in test %zd, expected \"%s\"\n",
+ i, tests[i].init_string);
+ ret = 1;
+ }
+ else if (strcmp (tests[i].init_string,
+ ((re_dfa_t *) regbuf.buffer)->init_string))
+ {
+ printf ("init_string \"%s\" in test %zd, expected \"%s\"\n",
+ ((re_dfa_t *) regbuf.buffer)->init_string,
+ i, tests[i].init_string);
+ ret = 1;
+ }
+ if (((re_dfa_t *) regbuf.buffer)->init_string != NULL)
+ {
+ int j, c = ((re_dfa_t *) regbuf.buffer)->init_string [0];
+
+ for (j = 0; j < 256; ++j)
+ if (regbuf.fastmap[j] != (j == c))
+ {
+ printf ("init_string \"%s\" doesn't match fastmap (%02x)\n",
+ ((re_dfa_t *) regbuf.buffer)->init_string, j);
+ ret = 1;
+ break;
+ }
+ }
+
+ res = re_search (®buf, tests[i].string, strlen (tests[i].string), 0,
+ strlen (tests[i].string), NULL);
+ if (res != tests[i].res)
+ {
+ printf ("re_search %zd failed: %d\n", i, res);
+ ret = 1;
+ regfree (®buf);
+ continue;
+ }
+ regfree (®buf);
+
+ re_set_syntax (tests[i].syntax | RE_ICASE);
+ memset (®buf, '\0', sizeof (regbuf));
+ regbuf.fastmap = malloc (256);
+ err = re_compile_pattern (tests[i].pattern, strlen (tests[i].pattern),
+ ®buf);
+ if (err != NULL)
+ {
+ printf ("re_compile_pattern failed: %s\n", err);
+ ret = 1;
+ continue;
+ }
+
+ if (((re_dfa_t *) regbuf.buffer)->init_string != NULL)
+ {
+ printf ("init_string != NULL in ICASE test %zd\n", i);
+ regfree (®buf);
+ ret = 1;
+ continue;
+ }
+
+ res = re_search (®buf, tests[i].string, strlen (tests[i].string), 0,
+ strlen (tests[i].string), NULL);
+ if (res != tests[i].res)
+ {
+ printf ("re_search %zd failed: %d\n", i, res);
+ ret = 1;
+ regfree (®buf);
+ continue;
+ }
+ regfree (®buf);
+ }
+
+ return ret;
+}
--- libc/posix/Makefile.jj 2003-11-13 22:04:19.000000000 +0100
+++ libc/posix/Makefile 2003-11-14 23:32:09.000000000 +0100
@@ -78,7 +78,7 @@ tests := tstgetopt testfnm runtests run
bug-regex8 bug-regex9 bug-regex10 bug-regex11 bug-regex12 \
bug-regex13 bug-regex14 bug-regex15 bug-regex16 \
bug-regex17 bug-regex18 bug-regex19 bug-regex20 \
- tst-nice tst-nanosleep transbug tst-rxspencer
+ bug-regex21 tst-nice tst-nanosleep transbug tst-rxspencer
ifeq (yes,$(build-shared))
test-srcs := globtest
tests += wordexp-test tst-exec tst-spawn
@@ -159,6 +159,7 @@ bug-regex17-ENV = LOCPATH=$(common-objpf
bug-regex18-ENV = LOCPATH=$(common-objpfx)localedata
bug-regex19-ENV = LOCPATH=$(common-objpfx)localedata
bug-regex20-ENV = LOCPATH=$(common-objpfx)localedata
+bug-regex21-ENV = LOCPATH=$(common-objpfx)localedata
testcases.h: TESTS TESTS2C.sed
sed -f TESTS2C.sed < $< > $@T
--- libc/posix/tst-regex.c.jj 2003-11-12 21:33:35.000000000 +0100
+++ libc/posix/tst-regex.c 2003-11-14 23:59:09.000000000 +0100
@@ -121,6 +121,7 @@ main (void)
/* Run the actual tests. All tests are run in a single-byte and a
multi-byte locale. */
result = test_expr ("[äáà âéèêÃìîñöóòôüúùû]", 2, 2);
+ result |= test_expr ("Uddeborg", 2, 2);
result |= test_expr ("G.ran", 2, 3);
result |= test_expr ("G.\\{1\\}ran", 2, 3);
result |= test_expr ("G.*ran", 3, 44);
Jakub
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2003-11-15 0:04 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2003-11-15 0:04 RFC: regex init_string Jakub Jelinek
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).