From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (qmail 14007 invoked by alias); 26 May 2008 22:10:07 -0000 Received: (qmail 13983 invoked by uid 9697); 26 May 2008 22:10:06 -0000 Date: Mon, 26 May 2008 22:10:00 -0000 Message-ID: <20080526221006.13967.qmail@sourceware.org> From: pmachata@sourceware.org To: frysk-cvs@sourceware.org Subject: [SCM] master: Expression parser: recognize # syntax of qualified symbol name X-Git-Refname: refs/heads/master X-Git-Reftype: branch X-Git-Oldrev: 0c0c0865bb2e2097196119f0b8149e24883ed8e4 X-Git-Newrev: 519bd0d1c9f2147a5503d0a0058d6e73558d2de6 Mailing-List: contact frysk-cvs-help@sourceware.org; run by ezmlm Precedence: bulk List-Id: List-Subscribe: List-Post: List-Help: , Sender: frysk-cvs-owner@sourceware.org Reply-To: frysk@sourceware.org X-SW-Source: 2008-q2/txt/msg00292.txt.bz2 The branch, master has been updated via 519bd0d1c9f2147a5503d0a0058d6e73558d2de6 (commit) from 0c0c0865bb2e2097196119f0b8149e24883ed8e4 (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email. - Log ----------------------------------------------------------------- commit 519bd0d1c9f2147a5503d0a0058d6e73558d2de6 Author: Petr Machata Date: Tue May 27 00:04:36 2008 +0200 Expression parser: recognize # syntax of qualified symbol name * Because # operator is suffix, we need arbitrary look-ahead to disambiguate a+b.c from a+b.c#symbol. Thus all the parsing is done by hand, instead of relying on antlr, which would go nuts from all the lexical ambiguities. * It's not possible to start a filename with a number. Numbers are allowed only as line reference, e.g. 100#something. * The parser has some intelligence, and is not hungry if possible. For example, #dso#symbol+a is correctly parsed as addition of one qualified and one plain symbol, similarly for #dso#s+#dso2#s2. 
However for example #dso#symbol.field+#dso2#symbol2 doesn't work, the parser will think that "symbol.field+" is a file name. * Some of these restrictions can be lifted in future, when the set of allowed characters is cut down a bit, but some are inherent in the flawed # syntax. * PLT entries are specified with plt: prefix, e.g. "break *plt:write" (the symbol doesn't have to be qualified). * Symbol versions are specified with @VERSION suffix. String of letters, numbers and underscores is allowed after that. E.g. "p &write@GLIBCXY". ----------------------------------------------------------------------- Summary of changes: frysk-core/frysk/expr/CExpr.g | 194 ++++++++++++++++++++++++++++++++++++++- frysk-core/frysk/expr/ChangeLog | 4 + 2 files changed, 194 insertions(+), 4 deletions(-) First 500 lines of diff: diff --git a/frysk-core/frysk/expr/CExpr.g b/frysk-core/frysk/expr/CExpr.g index dd58ef3..349327a 100644 --- a/frysk-core/frysk/expr/CExpr.g +++ b/frysk-core/frysk/expr/CExpr.g @@ -79,6 +79,9 @@ header // version and license this file solely under the GPL without // exception. package frysk.expr; + + import java.util.regex.Pattern; + import java.util.regex.Matcher; } class CExprParser extends Parser; @@ -418,6 +421,27 @@ tokens OPERATOR = "operator"; } +{ + private String fqinit; + private char fqLA(int i) throws CharStreamException { + if (i >= fqinit.length()) + return LA(i - fqinit.length() + 1); + else + return fqinit.charAt(i); + } + private void fqmatch(String s) throws MismatchedCharException, CharStreamException { + while (fqinit.length() > 0) { + char c = s.charAt(0); + char d = fqinit.charAt(0); + if (c != d) + throw new MismatchedCharException(d, c, false, this); + s = s.substring(1); + fqinit = fqinit.substring(1); + } + super.match(s); + } +} + AMPERSAND : '&' ; AND : "&&" ; ASSIGNEQUAL : '=' ; @@ -471,11 +495,171 @@ TIMESEQUAL : "*=" ; protected ELLIPSIS : "..." 
; +/* + * Funky HPD #-syntax doesn't map very well to LL-k type parser (for + * constant 'k'). When written directly, we get lots of lexical + * ambiguities. We work around that by doing arbitrary manual + * look-ahead and just parsing the tokens ourselves. Any whitespace + * or EOF stops the lookahead. + */ + +private +PARSE_FQIDENT + : { + // Automaton state is composed of following sub-states: + final int FILE = 1; + final int LINE = 2; + final int SYMB = 4; + int state = LINE | SYMB; + + String matched = ""; + String part = ""; + + String dso = null; + String file = null; + String proc = null; + String line = null; + + int i = 0; + char c; + if ((c = fqLA(0)) == '#') { + matched += c; + i++; + while (true) { + c = fqLA(i++); + matched += c; + if (Character.isWhitespace(c) || c == EOF_CHAR) + // This is a wack. + throw new RecognitionException("Nonterminated DSO part `" + matched + + "' in fully qualified notation."); + else if (c == '#') + break; + part += c; + } + if (part.length() == 0) + throw new RecognitionException("Empty DSO part `" + matched + + "' in fully qualified notation."); + dso = part; + part = ""; + } + + loop: while(true) { + c = fqLA(i++); + if (Character.isWhitespace(c) || c == EOF_CHAR) + break; + + matched += c; + part += c; + switch (c) { + case '.': { + state |= FILE; + state &= ~SYMB; + break; + } + + case '#': { + if (line == null && proc == null) { + if ((state & FILE) != 0 && file == null) + file = part.substring(0, part.length() - 1); + else if ((state & LINE) != 0) + line = part.substring(0, part.length() - 1); + else if ((state & SYMB) != 0) { + proc = part.substring(0, part.length() - 1); + if (!Character.isJavaIdentifierStart(proc.charAt(0))) + throw new RecognitionException("Procedure part (`" + proc + "') in fully " + + "qualified notation has to be valid identifier."); + } else + // This # could belong to the next symbol. + // Break out and try to match the initial sequence. 
+ break loop; + } else + throw new RecognitionException("Unexpected `#' after line or proc name was defined."); + + state = SYMB; + if (line == null && proc == null) + state |= LINE; + part = ""; + break; + } + + default: { + if (!(c >= '0' && c <= '9')) { + state &= ~LINE; + + if (!(Character.isJavaIdentifierStart(c) + || c == '@' + || (c == ':' && part.equals("plt:")))) { + + // Break out early if we are already + // just waiting for symbol. + if (line != null || proc != null) + break loop; + else + state &= ~SYMB; + } + } + } + } + } + + // ((state & SYMB) == 0) here means that we've parsed more + // than a symbol name, in hope it would turn out to be a + // file name (e.g. hello-world.c#symbol as a symbol + // reference vs. hello-world.c as an expression involving + // subtraction and struct access). In following, we take + // care not to consume anything that's not an identifier. + // E.g. when the user types "a+b", we want to match + // only identifier "a". + + boolean wantPlt = false; + if (part.startsWith("plt:")) { + wantPlt = true; + part = part.substring(4); + } + + int v = part.indexOf('@'); + String version = null; + if (v >= 0) { + version = part.substring(v + 1); + part = part.substring(0, v); + } + + // This is deliberately simplified and ignores request for initial letter. + // This is for better error reporting below, we first snip off irrelevant + // parts before yelling at user that his identifier sucks. + Matcher m = Pattern.compile("[a-zA-Z0-9_$]*").matcher(part); + if (m.lookingAt()) { + // XXX This accepts also e.g. "plt:something" (i.e. without the "#" part). Ok? 
+ int diff = part.length() - m.end(); + if (diff > 0) { + matched = matched.substring(0, matched.length() - diff); + part = part.substring(0, m.end()); + } + } + + if (!Character.isJavaIdentifierStart(part.charAt(0))) + throw new RecognitionException("Invalid symbol `" + part + "'."); + + if (dso != null) + System.err.println("DSO: " + dso); + if (file != null) + System.err.println("File: " + file); + if (line != null) + System.err.println("Line: " + line); + if (proc != null) + System.err.println("Proc: " + proc); + System.err.println("Symb: " + (wantPlt ? "plt:" : "") + + part + (version != null ? "@" + version : "")); + + // The string MATCHED holds whole fqid expression. Decide + // if it's syntactically correct. + fqmatch(matched); + System.out.println("matched = " + matched); + } ; + protected IDENT -options {testLiterals = true;} - : ('$')*('a'..'z'|'A'..'Z'|'_') ('a'..'z'|'A'..'Z'|'0'..'9'|'_')* -; + : ('$'|'#'|'a'..'z'|'A'..'Z'|'_') { fqinit = $getText; } PARSE_FQIDENT ; /** * A token is returned not only on regular tabs @@ -621,7 +805,9 @@ NUM | ('0'..'7')+ {_ttype = OCTALINT;} )? - | ('1'..'9') ('0'..'9')* {_ttype = DECIMALINT;} + | (('1'..'9') ('0'..'9')* {_ttype = DECIMALINT;}) + ( '#' {fqinit = $getText;} + PARSE_FQIDENT { $setType(IDENT); } )? ) ( ('l'|'L') { _ttype = DECIMALINT; } diff --git a/frysk-core/frysk/expr/ChangeLog b/frysk-core/frysk/expr/ChangeLog index ae21bf8..6e97358 100644 --- a/frysk-core/frysk/expr/ChangeLog +++ b/frysk-core/frysk/expr/ChangeLog @@ -1,3 +1,7 @@ +2008-05-26 Petr Machata + + * CExpr.g: Implement #-syntax parser. + 2008-05-20 Sami Wagiaalla * ExprSearchEngine.java: New. hooks/post-receive -- frysk system monitor/debugger