LVM2/libdm/regex parse_rx.c

public inbox for lvm2-cvs@sourceware.org
help / color / mirror / Atom feed

From: agk@sourceware.org
To: lvm-devel@redhat.com, lvm2-cvs@sourceware.org
Subject: LVM2/libdm/regex parse_rx.c
Date: Tue, 20 Apr 2010 22:31:00 -0000	[thread overview]
Message-ID: <20100420223123.5775.qmail@sourceware.org> (raw)

CVSROOT:	/cvs/lvm2
Module name:	LVM2
Changes by:	agk@sourceware.org	2010-04-20 22:31:22

Modified files:
	libdm/regex    : parse_rx.c 

Log message:
	Add a regex optimisation pass for shared character prefixes.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/LVM2/libdm/regex/parse_rx.c.diff?cvsroot=lvm2&r1=1.3&r2=1.4

--- LVM2/libdm/regex/parse_rx.c	2008/11/03 18:59:59	1.3
+++ LVM2/libdm/regex/parse_rx.c	2010/04/20 22:31:22	1.4
@@ -329,19 +329,182 @@
 	return n;
 }
 
+/*----------------------------------------------------------------*/
+
+/*
+ * The optimiser spots common prefixes on either side of an 'or' node, and
+ * lifts them outside the 'or' with a 'cat'.
+ */
+static unsigned _leftmost_depth(struct rx_node *r)
+{
+	int count = 1;
+
+	while (r->type != CHARSET) {
+		count++;
+		r = r->left;
+	}
+
+	return count;
+}
+
+/*
+ * FIXME: a unique key could be built up as part of the parse, to make the
+ * comparison quick.  Alternatively we could use cons-hashing, and then
+ * this would simply be a pointer comparison.
+ */
+static int _nodes_equal(struct rx_node *l, struct rx_node *r)
+{
+	if (l->type != r->type)
+		return 0;
+
+	switch (l->type) {
+	case CAT:
+	case OR:
+		return _nodes_equal(l->left, r->left) &&
+			_nodes_equal(l->right, r->right);
+
+	case STAR:
+	case PLUS:
+	case QUEST:
+		return _nodes_equal(l->left, r->left);
+
+	case CHARSET:
+		return dm_bitset_equal(l->charset, r->charset);
+	}
+
+	/* NOTREACHED */
+	return_0;
+}
+
+static int _find_leftmost_common(struct rx_node *or,
+                                 struct rx_node **l,
+                                 struct rx_node **r)
+{
+	struct rx_node *left = or->left, *right = or->right;
+	unsigned left_depth = _leftmost_depth(left);
+	unsigned right_depth = _leftmost_depth(right);
+
+	while (left_depth > right_depth) {
+		left = left->left;
+		left_depth--;
+	}
+
+	while (right_depth > left_depth) {
+		right = right->left;
+		right_depth--;
+	}
+
+	while (left_depth) {
+		if (left->type == CAT && right->type == CAT) {
+			if (_nodes_equal(left->left, right->left)) {
+				*l = left;
+				*r = right;
+				return 1;
+			}
+		}
+		left = left->left;
+		right = right->left;
+		left_depth--;
+	}
+
+	return 0;
+}
+
+static struct rx_node *_pass(struct dm_pool *mem,
+                             struct rx_node *r,
+                             int *changed)
+{
+	/*
+	 * walk the tree, optimising every 'or' node.
+	 */
+	switch (r->type) {
+	case CAT:
+		if (!(r->left = _pass(mem, r->left, changed)))
+			return_NULL;
+
+		if (!(r->right = _pass(mem, r->right, changed)))
+			return_NULL;
+
+		break;
+
+	case STAR:
+	case PLUS:
+	case QUEST:
+		if (!(r->left = _pass(mem, r->left, changed)))
+			return_NULL;
+		break;
+
+	case OR:
+		/* It's important we optimise sub nodes first */
+		if (!(r->left = _pass(mem, r->left, changed)))
+			return_NULL;
+
+		if (!(r->right = _pass(mem, r->right, changed)))
+			return_NULL;
+
+		{
+			struct rx_node *left, *right;
+
+			if (_find_leftmost_common(r, &left, &right)) {
+				struct rx_node *new_r = _node(mem, CAT, left->left, r);
+
+				if (!new_r)
+					return_NULL;
+
+				memcpy(left, left->right, sizeof(*left));
+				memcpy(right, right->right, sizeof(*right));
+
+				r = new_r;
+
+				*changed = 1;
+			}
+		}
+		break;
+
+	case CHARSET:
+		break;
+	}
+
+	return r;
+}
+
+static struct rx_node *_optimise(struct dm_pool *mem, struct rx_node *r)
+{
+	/*
+	 * We're looking for (or (... (cat <foo> a)) (... (cat <foo> b)))
+	 * and want to turn it into (cat <foo> (or (... a) (... b)))
+	 */
+
+	/*
+	 * Initially done as an inefficient multipass algorithm.
+	 */
+	int changed;
+
+	do {
+		changed = 0;
+		r = _pass(mem, r, &changed);
+	} while (r && changed);
+
+	return r;
+}
+
+/*----------------------------------------------------------------*/
+
 struct rx_node *rx_parse_tok(struct dm_pool *mem,
 			     const char *begin, const char *end)
 {
 	struct rx_node *r;
 	struct parse_sp *ps = dm_pool_zalloc(mem, sizeof(*ps));
 
-	if (!ps) {
-		stack;
-		return NULL;
-	}
+	if (!ps)
+		return_NULL;
 
 	ps->mem = mem;
-	ps->charset = dm_bitset_create(mem, 256);
+	if (!(ps->charset = dm_bitset_create(mem, 256))) {
+		log_error("Regex charset allocation failed");
+		dm_pool_free(mem, ps);
+		return NULL;
+	}
 	ps->cursor = begin;
 	ps->rx_end = end;
 	_rx_get_token(ps);		/* load the first token */
@@ -349,6 +512,13 @@
 	if (!(r = _or_term(ps))) {
 		log_error("Parse error in regex");
 		dm_pool_free(mem, ps);
+		return NULL;
+	}
+
+	if (!(r = _optimise(mem, r))) {
+		log_error("Regex optimisation error");
+		dm_pool_free(mem, ps);
+		return NULL;
 	}
 
 	return r;

next             reply	other threads:[~2010-04-20 22:31 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2010-04-20 22:31 agk [this message]
2010-04-22  3:24 agk
2010-04-22 13:18 agk
2010-04-22 13:42 agk
2010-04-22 14:33 agk
2010-04-22 20:15 agk
2010-08-09 10:29 thornber
2010-11-02 20:10 agk

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20100420223123.5775.qmail@sourceware.org \
    --to=agk@sourceware.org \
    --cc=lvm-devel@redhat.com \
    --cc=lvm2-cvs@sourceware.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).