* [PATCH] Optimize UTF-8 [ABC] searches
@ 2003-11-13 20:53 Jakub Jelinek
2003-11-13 21:24 ` Ulrich Drepper
0 siblings, 1 reply; 2+ messages in thread
From: Jakub Jelinek @ 2003-11-13 20:53 UTC (permalink / raw)
To: Ulrich Drepper; +Cc: Glibc hackers
Hi!
This optimizes also searches which contain bracket expressions, assuming
they don't contain >= 0x80 chars, negation, char ranges, collating elements,
equivalence classes or char classes.
2003-11-13 Jakub Jelinek <jakub@redhat.com>
* posix/regcomp.c (optimize_utf8): Optimize even if SIMPLE_BRACKET
with no bits set for >= 0x80 chars is seen.
* posix/bug-regex20.c (tests): Add new tests. Expect [ABC] to be
optimized.
(main): Run all tests with RE_ICASE as well.
--- libc/posix/regcomp.c.jj 2003-11-13 10:12:11.000000000 +0100
+++ libc/posix/regcomp.c 2003-11-13 20:47:55.000000000 +0100
@@ -963,7 +963,7 @@ static void
optimize_utf8 (dfa)
re_dfa_t *dfa;
{
- int node;
+ int node, i;
for (node = 0; node < dfa->nodes_len; ++node)
switch (dfa->nodes[node].type)
@@ -998,6 +998,12 @@ optimize_utf8 (dfa)
case OP_OPEN_SUBEXP:
case OP_CLOSE_SUBEXP:
break;
+ case SIMPLE_BRACKET:
+ /* Just double check. */
+ for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i)
+ if (dfa->nodes[node].opr.sbcset[i])
+ return;
+ break;
default:
return;
}
--- libc/posix/bug-regex20.c.jj 2003-11-12 20:08:34.000000000 +0100
+++ libc/posix/bug-regex20.c 2003-11-13 21:39:19.000000000 +0100
@@ -47,8 +47,8 @@ static struct
{RE_SYNTAX_POSIX_BASIC, "^x\\\\y\\{6\\}z\\+", "x\\yyyyyyzz\xc3\xb6", 0, 1},
{RE_SYNTAX_POSIX_BASIC, "^x\\\\y\\{2,36\\}z\\+", "x\\yzz\xc3\xb6", -1, 1},
{RE_SYNTAX_POSIX_BASIC, "^x\\\\y\\{,3\\}z\\+", "x\\yyyzz\xc3\xb6", 0, 1},
- /* FIXME. This one should be optimizable, but is not ATM. */
- {RE_SYNTAX_POSIX_BASIC, "x[ABC]y", "axCy", 1, 0},
+ {RE_SYNTAX_POSIX_BASIC, "x[C]y", "axCy", 1, 1},
+ {RE_SYNTAX_POSIX_BASIC, "x[ABC]y", "axCy", 1, 1},
{RE_SYNTAX_POSIX_BASIC, "\\`x\\|z\\'", "x\xe2\x80\x94", 0, 1},
{RE_SYNTAX_POSIX_BASIC, "\\(xy\\)z\\1a\\1", "\xe2\x80\x94xyzxyaxy\xc3\x84", 3, 1},
{RE_SYNTAX_POSIX_BASIC, "xy\\?z", "\xc3\x84xz\xc3\xb6", 2, 1},
@@ -57,8 +57,8 @@ static struct
{RE_SYNTAX_POSIX_EXTENDED, "^x\\\\y{6}z+", "x\\yyyyyyzz\xc3\xb6", 0, 1},
{RE_SYNTAX_POSIX_EXTENDED, "^x\\\\y{2,36}z+", "x\\yzz\xc3\xb6", -1, 1},
{RE_SYNTAX_POSIX_EXTENDED, "^x\\\\y{,3}z+", "x\\yyyzz\xc3\xb6", 0, 1},
- /* FIXME. This one should be optimizable, but is not ATM. */
- {RE_SYNTAX_POSIX_EXTENDED, "x[ABC]y", "axCy", 1, 0},
+ {RE_SYNTAX_POSIX_EXTENDED, "x[C]y", "axCy", 1, 1},
+ {RE_SYNTAX_POSIX_EXTENDED, "x[ABC]y", "axCy", 1, 1},
{RE_SYNTAX_POSIX_EXTENDED, "\\`x|z\\'", "x\xe2\x80\x94", 0, 1},
{RE_SYNTAX_POSIX_EXTENDED, "(xy)z\\1a\\1", "\xe2\x80\x94xyzxyaxy\xc3\x84", 3, 1},
{RE_SYNTAX_POSIX_EXTENDED, "xy?z", "\xc3\x84xz\xc3\xb6", 2, 1},
@@ -69,6 +69,8 @@ static struct
{RE_SYNTAX_POSIX_BASIC, "x[A-Z,]y", "axCy", 1, 0},
{RE_SYNTAX_POSIX_BASIC, "x[^y]z", "ax\xe2\x80\x94z", 1, 0},
{RE_SYNTAX_POSIX_BASIC, "x[[:alnum:]]z", "ax\xc3\x96z", 1, 0},
+ {RE_SYNTAX_POSIX_BASIC, "x[[=A=]]z", "axAz", 1, 0},
+ {RE_SYNTAX_POSIX_BASIC, "x[[=\xc3\x84=]]z", "ax\xc3\x84z", 1, 0},
{RE_SYNTAX_POSIX_BASIC, "\\<g", "\xe2\x80\x94g", 3, 0},
{RE_SYNTAX_POSIX_BASIC, "\\bg\\b", "\xe2\x80\x94g", 3, 0},
{RE_SYNTAX_POSIX_BASIC, "\\Bg\\B", "\xc3\xa4g\xc3\xa4", 2, 0},
@@ -80,6 +82,8 @@ static struct
{RE_SYNTAX_POSIX_EXTENDED, "x[A-Z,]y", "axCy", 1, 0},
{RE_SYNTAX_POSIX_EXTENDED, "x[^y]z", "ax\xe2\x80\x94z", 1, 0},
{RE_SYNTAX_POSIX_EXTENDED, "x[[:alnum:]]z", "ax\xc3\x96z", 1, 0},
+ {RE_SYNTAX_POSIX_EXTENDED, "x[[=A=]]z", "axAz", 1, 0},
+ {RE_SYNTAX_POSIX_EXTENDED, "x[[=\xc3\x84=]]z", "ax\xc3\x84z", 1, 0},
{RE_SYNTAX_POSIX_EXTENDED, "\\<g", "\xe2\x80\x94g", 3, 0},
{RE_SYNTAX_POSIX_EXTENDED, "\\bg\\b", "\xe2\x80\x94g", 3, 0},
{RE_SYNTAX_POSIX_EXTENDED, "\\Bg\\B", "\xc3\xa4g\xc3\xa4", 2, 0},
@@ -101,6 +105,7 @@ main (void)
for (i = 0; i < sizeof (tests) / sizeof (tests[0]); ++i)
{
int res, optimized;
+
re_set_syntax (tests[i].syntax);
memset (®buf, '\0', sizeof (regbuf));
err = re_compile_pattern (tests[i].pattern, strlen (tests[i].pattern),
@@ -131,6 +136,37 @@ main (void)
continue;
}
regfree (®buf);
+
+ re_set_syntax (tests[i].syntax | RE_ICASE);
+ memset (®buf, '\0', sizeof (regbuf));
+ err = re_compile_pattern (tests[i].pattern, strlen (tests[i].pattern),
+ ®buf);
+ if (err != NULL)
+ {
+ printf ("re_compile_pattern failed: %s\n", err);
+ ret = 1;
+ continue;
+ }
+
+ /* Check if re_search will be done as multi-byte or single-byte. */
+ optimized = ((re_dfa_t *) regbuf.buffer)->mb_cur_max == 1;
+ if (optimized)
+ {
+ printf ("pattern %zd optimized while it should not be when case insensitive\n",
+ i);
+ ret = 1;
+ }
+
+ res = re_search (®buf, tests[i].string, strlen (tests[i].string), 0,
+ strlen (tests[i].string), NULL);
+ if (res != tests[i].res)
+ {
+ printf ("re_search %zd failed: %d\n", i, res);
+ ret = 1;
+ regfree (®buf);
+ continue;
+ }
+ regfree (®buf);
}
return ret;
Jakub
^ permalink raw reply [flat|nested] 2+ messages in thread
* Re: [PATCH] Optimize UTF-8 [ABC] searches
2003-11-13 20:53 [PATCH] Optimize UTF-8 [ABC] searches Jakub Jelinek
@ 2003-11-13 21:24 ` Ulrich Drepper
0 siblings, 0 replies; 2+ messages in thread
From: Ulrich Drepper @ 2003-11-13 21:24 UTC (permalink / raw)
To: Jakub Jelinek; +Cc: Glibc hackers
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1
Jakub Jelinek wrote:
> This optimizes also searches which contain bracket expressions, assuming
> they don't contain >= 0x80 chars, negation, char ranges, collating elements,
> equivalence classes or char classes.
Thanks, I've applied it.
- --
⧠Ulrich Drepper ⧠Red Hat, Inc. ⧠444 Castro St ⧠Mountain View, CA â
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.2.2 (GNU/Linux)
iD8DBQE/s/Y02ijCOnn/RHQRArCuAKDFfaoM+FcxMFgv3axZZYgoDudglgCfZYBw
9WIuwW3OpueLEHb9RzRcxvE=
=K/+B
-----END PGP SIGNATURE-----
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2003-11-13 21:24 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2003-11-13 20:53 [PATCH] Optimize UTF-8 [ABC] searches Jakub Jelinek
2003-11-13 21:24 ` Ulrich Drepper
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).