public inbox for frysk-cvs@sourceware.org help / color / mirror / Atom feed
From: pmachata@sourceware.org To: frysk-cvs@sourceware.org Subject: [SCM] master: Expression parser: recognize # syntax of qualified symbol name Date: Mon, 26 May 2008 22:10:00 -0000 [thread overview] Message-ID: <20080526221006.13967.qmail@sourceware.org> (raw) The branch, master has been updated via 519bd0d1c9f2147a5503d0a0058d6e73558d2de6 (commit) from 0c0c0865bb2e2097196119f0b8149e24883ed8e4 (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email. - Log ----------------------------------------------------------------- commit 519bd0d1c9f2147a5503d0a0058d6e73558d2de6 Author: Petr Machata <pmachata@redhat.com> Date: Tue May 27 00:04:36 2008 +0200 Expression parser: recognize # syntax of qualified symbol name * Because # operator is suffix, we need arbitrary look-ahead to disambiguate a+b.c from a+b.c#symbol. Thus all the parsing is done by hand, instead of relying on antlr, which would go nuts from all the lexical ambiguities. * It's not possible to start a filename with a number. Numbers are allowed only as line reference, e.g. 100#something. * The parser has some intelligence, and is not hungry if possible. For example, #dso#symbol+a is correctly parsed as addition of one qualified and one plain symbol, similarly for #dso#s+#dso2#s2. However for example #dso#symbol.field+#dso2#symbol2 doesn't work, the parser will think that "symbol.field+" is a file name. * Some of these restrictions can be lifted in future, when the set of allowed characters is cut down a bit, but some are inherent in the flawed # syntax. * PLT entries are specified with plt: prefix, e.g. "break *plt:write" (the symbol doesn't have to be qualified). * Symbol versions are specified with @VERSION suffix. String of letters, numbers and underscores is allowed after that. E.g. "p &write@GLIBCXY". 
----------------------------------------------------------------------- Summary of changes: frysk-core/frysk/expr/CExpr.g | 194 ++++++++++++++++++++++++++++++++++++++- frysk-core/frysk/expr/ChangeLog | 4 + 2 files changed, 194 insertions(+), 4 deletions(-) First 500 lines of diff: diff --git a/frysk-core/frysk/expr/CExpr.g b/frysk-core/frysk/expr/CExpr.g index dd58ef3..349327a 100644 --- a/frysk-core/frysk/expr/CExpr.g +++ b/frysk-core/frysk/expr/CExpr.g @@ -79,6 +79,9 @@ header // version and license this file solely under the GPL without // exception. package frysk.expr; + + import java.util.regex.Pattern; + import java.util.regex.Matcher; } class CExprParser extends Parser; @@ -418,6 +421,27 @@ tokens OPERATOR = "operator"; } +{ + private String fqinit; + private char fqLA(int i) throws CharStreamException { + if (i >= fqinit.length()) + return LA(i - fqinit.length() + 1); + else + return fqinit.charAt(i); + } + private void fqmatch(String s) throws MismatchedCharException, CharStreamException { + while (fqinit.length() > 0) { + char c = s.charAt(0); + char d = fqinit.charAt(0); + if (c != d) + throw new MismatchedCharException(d, c, false, this); + s = s.substring(1); + fqinit = fqinit.substring(1); + } + super.match(s); + } +} + AMPERSAND : '&' ; AND : "&&" ; ASSIGNEQUAL : '=' ; @@ -471,11 +495,171 @@ TIMESEQUAL : "*=" ; protected ELLIPSIS : "..." ; +/* + * Funky HPD #-syntax doesn't map very well to LL-k type parser (for + * constant 'k'). When written directly, we get lots of lexical + * ambiguities. We work around that by doing arbitrary manual + * look-ahead and just parsing the tokens ourselves. Any whitespace + * or EOF stops the lookahead. 
+ */ + +private +PARSE_FQIDENT + : { + // Automaton state is composed of following sub-states: + final int FILE = 1; + final int LINE = 2; + final int SYMB = 4; + int state = LINE | SYMB; + + String matched = ""; + String part = ""; + + String dso = null; + String file = null; + String proc = null; + String line = null; + + int i = 0; + char c; + if ((c = fqLA(0)) == '#') { + matched += c; + i++; + while (true) { + c = fqLA(i++); + matched += c; + if (Character.isWhitespace(c) || c == EOF_CHAR) + // This is a wack. + throw new RecognitionException("Nonterminated DSO part `" + matched + + "' in fully qualified notation."); + else if (c == '#') + break; + part += c; + } + if (part.length() == 0) + throw new RecognitionException("Empty DSO part `" + matched + + "' in fully qualified notation."); + dso = part; + part = ""; + } + + loop: while(true) { + c = fqLA(i++); + if (Character.isWhitespace(c) || c == EOF_CHAR) + break; + + matched += c; + part += c; + switch (c) { + case '.': { + state |= FILE; + state &= ~SYMB; + break; + } + + case '#': { + if (line == null && proc == null) { + if ((state & FILE) != 0 && file == null) + file = part.substring(0, part.length() - 1); + else if ((state & LINE) != 0) + line = part.substring(0, part.length() - 1); + else if ((state & SYMB) != 0) { + proc = part.substring(0, part.length() - 1); + if (!Character.isJavaIdentifierStart(proc.charAt(0))) + throw new RecognitionException("Procedure part (`" + proc + "') in fully " + + "qualified notation has to be valid identifier."); + } else + // This # could belong to the next symbol. + // Break out and try to match the initial sequence. 
+ break loop; + } else + throw new RecognitionException("Unexpected `#' after line or proc name was defined."); + + state = SYMB; + if (line == null && proc == null) + state |= LINE; + part = ""; + break; + } + + default: { + if (!(c >= '0' && c <= '9')) { + state &= ~LINE; + + if (!(Character.isJavaIdentifierStart(c) + || c == '@' + || (c == ':' && part.equals("plt:")))) { + + // Break out early if we are already + // just waiting for symbol. + if (line != null || proc != null) + break loop; + else + state &= ~SYMB; + } + } + } + } + } + + // ((state & SYMB) == 0) here means that we've parsed more + // than a symbol name, in hope it would turn out to be a + // file name (e.g. hello-world.c#symbol as a symbol + // reference vs. hello-world.c as an expression involving + // subtraction and struct access). In following, we take + // care not to consume anything that's not an identifier. + // E.g. when the user types "a+b", we want to match + // only identifier "a". + + boolean wantPlt = false; + if (part.startsWith("plt:")) { + wantPlt = true; + part = part.substring(4); + } + + int v = part.indexOf('@'); + String version = null; + if (v >= 0) { + version = part.substring(v + 1); + part = part.substring(0, v); + } + + // This is delibaretely simplified and ignores request for initial letter. + // This is for better error reporting below, we first snip off irrelevant + // parts before yelling at user that his identifier sucks. + Matcher m = Pattern.compile("[a-zA-Z0-9_$]*").matcher(part); + if (m.lookingAt()) { + // XXX This accepts also e.g. "plt:something" (i.e. without the "#" part). Ok? 
+ int diff = part.length() - m.end(); + if (diff > 0) { + matched = matched.substring(0, matched.length() - diff); + part = part.substring(0, m.end()); + } + } + + if (!Character.isJavaIdentifierStart(part.charAt(0))) + throw new RecognitionException("Invalid symbol `" + part + "'."); + + if (dso != null) + System.err.println("DSO: " + dso); + if (file != null) + System.err.println("File: " + file); + if (line != null) + System.err.println("Line: " + line); + if (proc != null) + System.err.println("Proc: " + proc); + System.err.println("Symb: " + (wantPlt ? "plt:" : "") + + part + (version != null ? "@" + version : "")); + + // The string MATCHED holds whole fqid expression. Decide + // if it's syntactically correct. + fqmatch(matched); + System.out.println("matched = " + matched); + } ; + protected IDENT -options {testLiterals = true;} - : ('$')*('a'..'z'|'A'..'Z'|'_') ('a'..'z'|'A'..'Z'|'0'..'9'|'_')* -; + : ('$'|'#'|'a'..'z'|'A'..'Z'|'_') { fqinit = $getText; } PARSE_FQIDENT ; /** * A <TAB> token is returned not only on regular tabs @@ -621,7 +805,9 @@ NUM | ('0'..'7')+ {_ttype = OCTALINT;} )? - | ('1'..'9') ('0'..'9')* {_ttype = DECIMALINT;} + | (('1'..'9') ('0'..'9')* {_ttype = DECIMALINT;}) + ( '#' {fqinit = $getText;} + PARSE_FQIDENT { $setType(IDENT); } )? ) ( ('l'|'L') { _ttype = DECIMALINT; } diff --git a/frysk-core/frysk/expr/ChangeLog b/frysk-core/frysk/expr/ChangeLog index ae21bf8..6e97358 100644 --- a/frysk-core/frysk/expr/ChangeLog +++ b/frysk-core/frysk/expr/ChangeLog @@ -1,3 +1,7 @@ +2008-05-26 Petr Machata <pmachata@redhat.com> + + * CExpr.g: Implement #-syntax parser. + 2008-05-20 Sami Wagiaalla <swagiaal@redhat.com> * ExprSearchEngine.java: New. hooks/post-receive -- frysk system monitor/debugger
reply other threads:[~2008-05-26 22:10 UTC|newest] Thread overview: [no followups] expand[flat|nested] mbox.gz Atom feed
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=20080526221006.13967.qmail@sourceware.org \ --to=pmachata@sourceware.org \ --cc=frysk-cvs@sourceware.org \ --cc=frysk@sourceware.org \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link. Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox; see the mirroring instructions for how to clone and mirror all data and code used for this inbox, as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).