From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (qmail 32345 invoked by alias); 15 Nov 2003 00:04:39 -0000 Mailing-List: contact libc-hacker-help@sources.redhat.com; run by ezmlm Precedence: bulk List-Subscribe: List-Archive: List-Post: List-Help: , Sender: libc-hacker-owner@sources.redhat.com Received: (qmail 32328 invoked from network); 15 Nov 2003 00:04:38 -0000 Received: from unknown (HELO sunsite.ms.mff.cuni.cz) (195.113.19.66) by sources.redhat.com with SMTP; 15 Nov 2003 00:04:38 -0000 Received: from sunsite.ms.mff.cuni.cz (sunsite.mff.cuni.cz [127.0.0.1]) by sunsite.ms.mff.cuni.cz (8.12.8/8.12.8) with ESMTP id hAEM062c000794; Fri, 14 Nov 2003 23:00:06 +0100 Received: (from jakub@localhost) by sunsite.ms.mff.cuni.cz (8.12.8/8.12.8/Submit) id hAEM052c000790; Fri, 14 Nov 2003 23:00:05 +0100 Date: Sat, 15 Nov 2003 00:04:00 -0000 From: Jakub Jelinek To: Ulrich Drepper Cc: Glibc hackers Subject: RFC: regex init_string Message-ID: <20031114220005.GD12344@sunsite.ms.mff.cuni.cz> Reply-To: Jakub Jelinek Mime-Version: 1.0 Content-Type: text/plain; charset=iso-8859-1 Content-Disposition: inline Content-Transfer-Encoding: 8bit User-Agent: Mutt/1.4i X-SW-Source: 2003-11/txt/msg00073.txt.bz2 Hi! Many regular expressions start with quite a long initial string. This patch computes it (e.g. for "a(bc|bcd)" the initial string is "abc") and tries to optimize it (if the initial string is single char, then using memchr, otherwise memmem (which doesn't seem to be optimized too much)). I haven't done any serious benchmarking and until this is done and the speedups are on average worth the time spent in computing the string, it doesn't make sense to apply this. Maybe it might be useful to add Boyer-Moore if the initial string is longer, only testing will show. 2003-11-15 Jakub Jelinek * posix/regex_internal.h (struct re_dfa_t): Add init_string and init_string_len fields. * posix/regcomp.c (re_compile_fastmap): Free and clear init_string first or if it doesn't contain any chars afterwards, initialize init_string_len. (re_compile_fastmap_iter): Compute init_string. (free_dfa_content): Free init_string. * posix/regexec.c (re_search_internal): Optimize using init_string. * posix/bug-regex21.c: New test. * posix/tst-regex.c: Add test without special characters. * posix/Makefile (tests): Add bug-regex21. (bug-regex21-ENV): New. --- libc/posix/regex_internal.h.jj 2003-11-12 21:33:35.000000000 +0100 +++ libc/posix/regex_internal.h 2003-11-14 23:21:21.000000000 +0100 @@ -611,6 +611,9 @@ struct re_dfa_t unsigned int has_mb_node : 1; unsigned int is_utf8 : 1; int mb_cur_max; + /* Starting string (computed together with fastmap). */ + unsigned char *init_string; + size_t init_string_len; }; typedef struct re_dfa_t re_dfa_t; --- libc/posix/regcomp.c.jj 2003-11-13 22:43:39.000000000 +0100 +++ libc/posix/regcomp.c 2003-11-15 00:44:05.000000000 +0100 @@ -281,6 +281,8 @@ re_compile_fastmap (bufp) re_dfa_t *dfa = (re_dfa_t *) bufp->buffer; char *fastmap = bufp->fastmap; + re_free (dfa->init_string); + dfa->init_string = NULL; memset (fastmap, '\0', sizeof (char) * SBC_MAX); re_compile_fastmap_iter (bufp, dfa->init_state, fastmap); if (dfa->init_state != dfa->init_state_word) @@ -289,6 +291,16 @@ re_compile_fastmap (bufp) re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap); if (dfa->init_state != dfa->init_state_begbuf) re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap); + if (dfa->init_string) + { + if (fastmap[0] || !dfa->init_string[0]) + { + re_free (dfa->init_string); + dfa->init_string = NULL; + } + else + dfa->init_string_len = strlen (dfa->init_string); + } bufp->fastmap_accurate = 1; return 0; } @@ -335,6 +347,7 @@ re_compile_fastmap_iter (bufp, init_stat p = buf; *p++ = dfa->nodes[node].opr.c; while (++node < dfa->nodes_len + && p < buf + dfa->mb_cur_max && dfa->nodes[node].type == CHARACTER && dfa->nodes[node].mb_partial) *p++ = dfa->nodes[node].opr.c; @@ -345,6 +358,86 @@ re_compile_fastmap_iter (bufp, init_stat re_set_fastmap (fastmap, 0, buf[0]); } #endif + if ((dfa->mb_cur_max == 1 || dfa->is_utf8) + && !(bufp->syntax & RE_ICASE)) + { + if (dfa->init_string == NULL) + { + size_t len = 2; + unsigned char *p; + int tnode; + + tnode = node; + while ((tnode = dfa->nexts[tnode]) != -1) + { + switch (dfa->nodes[tnode].type) + { + case CHARACTER: + ++len; + continue; + case OP_OPEN_SUBEXP: + case OP_CLOSE_SUBEXP: + continue; + default: + break; + } + break; + } + + p = re_malloc (char, len); + if (p == NULL) + /* Signal to parent that dfa->init_string will not + be valid. */ + fastmap[0] = 1; + else + { + dfa->init_string = p; + for (tnode = node; tnode != -1; + tnode = dfa->nexts[tnode]) + { + switch (dfa->nodes[tnode].type) + { + case CHARACTER: + *p++ = dfa->nodes[tnode].opr.c; + continue; + case OP_OPEN_SUBEXP: + case OP_CLOSE_SUBEXP: + continue; + default: + break; + } + break; + } + *p = '\0'; + } + } + else + { + unsigned char *p; + int tnode; + + for (tnode = node, p = dfa->init_string; + tnode != -1; + tnode = dfa->nexts[tnode]) + { + switch (dfa->nodes[tnode].type) + { + case CHARACTER: + if (*p != dfa->nodes[tnode].opr.c) + break; + p++; + continue; + case OP_OPEN_SUBEXP: + case OP_CLOSE_SUBEXP: + continue; + default: + break; + } + *p = '\0'; + break; + } + } + } } else if (type == SIMPLE_BRACKET) { @@ -353,6 +446,8 @@ re_compile_fastmap_iter (bufp, init_stat for (j = 0; j < UINT_BITS; ++j, ++ch) if (dfa->nodes[node].opr.sbcset[i] & (1 << j)) re_set_fastmap (fastmap, icase, ch); + if (dfa->init_string) + *dfa->init_string = '\0'; } #ifdef RE_ENABLE_I18N else if (type == COMPLEX_BRACKET) @@ -399,6 +494,8 @@ re_compile_fastmap_iter (bufp, init_stat re_set_fastmap (fastmap, 0, *(unsigned char *) buf); } } + if (dfa->init_string) + *dfa->init_string = '\0'; } #endif /* RE_ENABLE_I18N */ else if (type == END_OF_RE || type == OP_PERIOD) @@ -406,6 +503,8 @@ re_compile_fastmap_iter (bufp, init_stat memset (fastmap, '\1', sizeof (char) * SBC_MAX); if (type == END_OF_RE) bufp->can_be_null = 1; + if (dfa->init_string) + *dfa->init_string = '\0'; return; } } @@ -604,6 +703,7 @@ free_dfa_content (re_dfa_t *dfa) #ifdef DEBUG re_free (dfa->re_str); #endif + re_free (dfa->init_string); re_free (dfa); } --- libc/posix/regexec.c.jj 2003-11-12 21:33:35.000000000 +0100 +++ libc/posix/regexec.c 2003-11-14 23:53:43.000000000 +0100 @@ -584,6 +584,7 @@ re_search_internal (preg, string, length { reg_errcode_t err; re_dfa_t *dfa = (re_dfa_t *)preg->buffer; + const unsigned char *init_string = NULL; re_string_t input; int left_lim, right_lim, incr; int fl_longest_match, match_first, match_last = -1; @@ -646,11 +647,42 @@ re_search_internal (preg, string, length right_lim = (range < 0) ? start : start + range; sb = dfa->mb_cur_max == 1; fast_translate = sb || !(preg->syntax & RE_ICASE || preg->translate); + if (range > 0 && fastmap && !preg->translate) + init_string = dfa->init_string; for (;;) { /* At first get the current byte from input string. */ - if (fastmap) + if (init_string) + { + const char *p; + + if (!init_string[1]) + p = memchr (string + match_first, init_string[0], + right_lim - match_first); + else + { + int mlim = right_lim + dfa->init_string_len; + + if (mlim > length) + mlim = length; + p = memmem (string + match_first, + mlim - match_first, + init_string, dfa->init_string_len); + } + + if (BE (p != NULL, 1)) + match_first = p - string; + else + { + match_first = right_lim; + if (match_first >= length) + break; + if ((unsigned char) string[match_first] != init_string[0]) + break; + } + } + else if (fastmap) { if (BE (fast_translate, 1)) { --- libc/posix/bug-regex21.c.jj 2003-11-14 22:39:28.000000000 +0100 +++ libc/posix/bug-regex21.c 2003-11-15 00:35:15.000000000 +0100 @@ -0,0 +1,238 @@ +/* Test for init_string regular expression optimizations. + Copyright (C) 2003 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Jakub Jelinek , 2003. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include +#include +#include +#include +#include +#include +#include + +#define RE_NO_INTERNAL_PROTOTYPES 1 +#include "regex_internal.h" + +#define BRE RE_SYNTAX_POSIX_BASIC +#define ERE RE_SYNTAX_POSIX_EXTENDED + +static struct +{ + int syntax; + const char *pattern; + const char *string; + int res; + const char *init_string; +} tests[] = { + /* \xc3\x84 LATIN CAPITAL LETTER A WITH DIAERESIS + \xc3\x96 LATIN CAPITAL LETTER O WITH DIAERESIS + \xc3\xa4 LATIN SMALL LETTER A WITH DIAERESIS + \xc3\xb6 LATIN SMALL LETTER O WITH DIAERESIS + \xe2\x80\x94 EM DASH */ + {BRE, "foo", "b\xc3\xa4rfoob\xc3\xa4z", 4, "foo"}, + {BRE, "f*oo", "b\xc3\xa4rfoob\xc3\xa4z", 4, NULL}, + {BRE, "f\\(o\\)o", "b\xc3\xa4rfoob\xc3\xa4z", 4, "foo"}, + {BRE, "f\\(o\\)\\+o", "b\xc3\xa4rfoob\xc3\xa4z", 4, "fo"}, + {BRE, "f\\(o\\)\\?o", "b\xc3\xa4rfoob\xc3\xa4z", 4, "f"}, + {BRE, "f\\(o\\)*o", "b\xc3\xa4rfoob\xc3\xa4z", 4, "f"}, + {BRE, "fo*b", "b\xc3\xa4rfoob\xc3\xa4z", 4, "f"}, + {BRE, "fo\\+b", "b\xc3\xa4rfoob\xc3\xa4z", 4, "fo"}, + {BRE, "fo\\?b", "b\xc3\xa4rfb\xc3\xa4z", 4, "f"}, + {BRE, "^b\xc3\xa4r", "b\xc3\xa4rfoob\xc3\xa4z", 0, "b\xc3\xa4r"}, + {BRE, "\\(fo\\)o", "b\xc3\xa4rfoob\xc3\xa4z", 4, "foo"}, + {BRE, "f\\(oo\\|bar\\)", "b\xc3\xa4rfoob\xc3\xa4z", 4, "f"}, + {BRE, "\\(fo\\|ba\\)", "b\xc3\xa4rfoob\xc3\xa4z", 4, NULL}, + {BRE, "^x\\|xy*z$", "\xc3\xb6xyyz", 2, "x"}, + {BRE, "^x\\\\y\\{6\\}z\\+", "x\\yyyyyyzz\xc3\xb6", 0, "x\\yyyyyyz"}, + {BRE, "^x\\\\y\\{2,36\\}z\\+", "x\\yzz\xc3\xb6", -1, "x\\yy"}, + {BRE, "^x\\\\y\\{,3\\}z\\+", "x\\yyyzz\xc3\xb6", 0, "x\\"}, + {BRE, "x[C]y", "axCy", 1, "x"}, + {BRE, "x[ABC]y", "axCy", 1, "x"}, + {BRE, "\\`x\\|z\\'", "x\xe2\x80\x94", 0, NULL}, + {BRE, "\\(xy\\)z\\1a\\1", "\xe2\x80\x94xyzxyaxy\xc3\x84", 3, "xyz"}, + {BRE, "xy\\?z", "\xc3\x84xz\xc3\xb6", 2, "x"}, + {BRE, "x.y", "ax\xe2\x80\x94yz", 1, "x"}, + {BRE, "x\xc3\x96*y", "ax\xc3\x96\xc3\x96yz", 1, "x"}, + {BRE, "x[\xc3\x84\xc3\xa4]y", "ax\xc3\xa4y", 1, "x"}, + {BRE, "x[A-Z,]y", "axCy", 1, "x"}, + {BRE, "x[^y]z", "ax\xe2\x80\x94z", 1, "x"}, + {BRE, "x[[:alnum:]]z", "ax\xc3\x96z", 1, "x"}, + {BRE, "x[[=A=]]z", "axAz", 1, "x"}, + {BRE, "x[[=\xc3\x84=]]z", "ax\xc3\x84z", 1, "x"}, + {BRE, "\\init_string != NULL) + { + printf ("init_string != NULL in test %zd\n", i); + regfree (®buf); + ret = 1; + continue; + } + + regbuf.fastmap = malloc (256); + re_compile_fastmap (®buf); + if (tests[i].init_string == NULL) + { + if (((re_dfa_t *) regbuf.buffer)->init_string != NULL) + { + printf ("init_string \"%s\" in test %zd, expected NULL\n", + ((re_dfa_t *) regbuf.buffer)->init_string, i); + ret = 1; + } + } + else if (((re_dfa_t *) regbuf.buffer)->init_string == NULL) + { + printf ("init_string NULL in test %zd, expected \"%s\"\n", + i, tests[i].init_string); + ret = 1; + } + else if (strcmp (tests[i].init_string, + ((re_dfa_t *) regbuf.buffer)->init_string)) + { + printf ("init_string \"%s\" in test %zd, expected \"%s\"\n", + ((re_dfa_t *) regbuf.buffer)->init_string, + i, tests[i].init_string); + ret = 1; + } + if (((re_dfa_t *) regbuf.buffer)->init_string != NULL) + { + int j, c = ((re_dfa_t *) regbuf.buffer)->init_string [0]; + + for (j = 0; j < 256; ++j) + if (regbuf.fastmap[j] != (j == c)) + { + printf ("init_string \"%s\" doesn't match fastmap (%02x)\n", + ((re_dfa_t *) regbuf.buffer)->init_string, j); + ret = 1; + break; + } + } + + res = re_search (®buf, tests[i].string, strlen (tests[i].string), 0, + strlen (tests[i].string), NULL); + if (res != tests[i].res) + { + printf ("re_search %zd failed: %d\n", i, res); + ret = 1; + regfree (®buf); + continue; + } + regfree (®buf); + + re_set_syntax (tests[i].syntax | RE_ICASE); + memset (®buf, '\0', sizeof (regbuf)); + regbuf.fastmap = malloc (256); + err = re_compile_pattern (tests[i].pattern, strlen (tests[i].pattern), + ®buf); + if (err != NULL) + { + printf ("re_compile_pattern failed: %s\n", err); + ret = 1; + continue; + } + + if (((re_dfa_t *) regbuf.buffer)->init_string != NULL) + { + printf ("init_string != NULL in ICASE test %zd\n", i); + regfree (®buf); + ret = 1; + continue; + } + + res = re_search (®buf, tests[i].string, strlen (tests[i].string), 0, + strlen (tests[i].string), NULL); + if (res != tests[i].res) + { + printf ("re_search %zd failed: %d\n", i, res); + ret = 1; + regfree (®buf); + continue; + } + regfree (®buf); + } + + return ret; +} --- libc/posix/Makefile.jj 2003-11-13 22:04:19.000000000 +0100 +++ libc/posix/Makefile 2003-11-14 23:32:09.000000000 +0100 @@ -78,7 +78,7 @@ tests := tstgetopt testfnm runtests run bug-regex8 bug-regex9 bug-regex10 bug-regex11 bug-regex12 \ bug-regex13 bug-regex14 bug-regex15 bug-regex16 \ bug-regex17 bug-regex18 bug-regex19 bug-regex20 \ - tst-nice tst-nanosleep transbug tst-rxspencer + bug-regex21 tst-nice tst-nanosleep transbug tst-rxspencer ifeq (yes,$(build-shared)) test-srcs := globtest tests += wordexp-test tst-exec tst-spawn @@ -159,6 +159,7 @@ bug-regex17-ENV = LOCPATH=$(common-objpf bug-regex18-ENV = LOCPATH=$(common-objpfx)localedata bug-regex19-ENV = LOCPATH=$(common-objpfx)localedata bug-regex20-ENV = LOCPATH=$(common-objpfx)localedata +bug-regex21-ENV = LOCPATH=$(common-objpfx)localedata testcases.h: TESTS TESTS2C.sed sed -f TESTS2C.sed < $< > $@T --- libc/posix/tst-regex.c.jj 2003-11-12 21:33:35.000000000 +0100 +++ libc/posix/tst-regex.c 2003-11-14 23:59:09.000000000 +0100 @@ -121,6 +121,7 @@ main (void) /* Run the actual tests. All tests are run in a single-byte and a multi-byte locale. */ result = test_expr ("[äáàâéèêíìîñöóòôüúùû]", 2, 2); + result |= test_expr ("Uddeborg", 2, 2); result |= test_expr ("G.ran", 2, 3); result |= test_expr ("G.\\{1\\}ran", 2, 3); result |= test_expr ("G.*ran", 3, 44); Jakub