* [4.1] Patch: FYI: another regex patch
@ 2006-02-13 22:58 Tom Tromey
0 siblings, 0 replies; only message in thread
From: Tom Tromey @ 2006-02-13 22:58 UTC (permalink / raw)
To: Java Patch List
I'm checking this in on the 4.1 branch.
Ito fixed another regex bug, one that occurs when using Eclipse's
'hippie completion'.
Tom
Index: ChangeLog.gcj
from Ito Kazumitsu <kaz@maczuka.gcd.org>
Fixes bug #26166
* gnu/regexp/RE.java (initialize): Parsing of character class expression
was moved to a new method parseCharClass.
(parseCharClass): New method originally in initialize. Added parsing
of nested character classes.
(ParseCharClassResult): New inner class used as a return value of
parseCharClass.
(getCharExpression), (getNamedProperty): Made static.
* gnu/regexp/RESyntax.java (RE_NESTED_CHARCLASS): New syntax flag.
* gnu/regexp/RETokenOneOf.java (addition): New Vector for storing
nested character classes.
(RETokenOneOf): New constructor accepting the Vector addition.
(getMinimumLength), (getMaximumLength): Returns 1 if the token
stands for only one character.
(match): Added the processing of the Vector addition.
(matchN), (matchP): Do not check next token if addition is used.
Index: gnu/regexp/RE.java
===================================================================
--- gnu/regexp/RE.java (revision 110936)
+++ gnu/regexp/RE.java (working copy)
@@ -1,5 +1,5 @@
/* gnu/regexp/RE.java
- Copyright (C) 1998-2001, 2004, 2005 Free Software Foundation, Inc.
+ Copyright (C) 2006 Free Software Foundation, Inc.
This file is part of GNU Classpath.
@@ -427,116 +427,12 @@
// [...] | [^...]
else if ((unit.ch == '[') && !(unit.bk || quot)) {
- Vector options = new Vector();
- boolean negative = false;
- // FIXME: lastChar == 0 means lastChar is not set. But what if
- // \u0000 is used as a meaningful character?
- char lastChar = 0;
- if (index == pLength) throw new REException(getLocalizedMessage("unmatched.bracket"),REException.REG_EBRACK,index);
-
- // Check for initial caret, negation
- if ((ch = pattern[index]) == '^') {
- negative = true;
- if (++index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
- ch = pattern[index];
- }
-
- // Check for leading right bracket literal
- if (ch == ']') {
- lastChar = ch;
- if (++index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
- }
-
- while ((ch = pattern[index++]) != ']') {
- if ((ch == '-') && (lastChar != 0)) {
- if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
- if ((ch = pattern[index]) == ']') {
- options.addElement(new RETokenChar(subIndex,lastChar,insens));
- lastChar = '-';
- } else {
- if ((ch == '\\') && syntax.get(RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS)) {
- CharExpression ce = getCharExpression(pattern, index, pLength, syntax);
- if (ce == null)
- throw new REException("invalid escape sequence", REException.REG_ESCAPE, index);
- ch = ce.ch;
- index = index + ce.len - 1;
- }
- options.addElement(new RETokenRange(subIndex,lastChar,ch,insens));
- lastChar = 0;
- index++;
- }
- } else if ((ch == '\\') && syntax.get(RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS)) {
- if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
- int posixID = -1;
- boolean negate = false;
- // FIXME: asciiEsc == 0 means asciiEsc is not set. But what if
- // \u0000 is used as a meaningful character?
- char asciiEsc = 0;
- NamedProperty np = null;
- if (("dswDSW".indexOf(pattern[index]) != -1) && syntax.get(RESyntax.RE_CHAR_CLASS_ESC_IN_LISTS)) {
- switch (pattern[index]) {
- case 'D':
- negate = true;
- case 'd':
- posixID = RETokenPOSIX.DIGIT;
- break;
- case 'S':
- negate = true;
- case 's':
- posixID = RETokenPOSIX.SPACE;
- break;
- case 'W':
- negate = true;
- case 'w':
- posixID = RETokenPOSIX.ALNUM;
- break;
- }
- }
- if (("pP".indexOf(pattern[index]) != -1) && syntax.get(RESyntax.RE_NAMED_PROPERTY)) {
- np = getNamedProperty(pattern, index - 1, pLength);
- if (np == null)
- throw new REException("invalid escape sequence", REException.REG_ESCAPE, index);
- index = index - 1 + np.len - 1;
- }
- else {
- CharExpression ce = getCharExpression(pattern, index - 1, pLength, syntax);
- if (ce == null)
- throw new REException("invalid escape sequence", REException.REG_ESCAPE, index);
- asciiEsc = ce.ch;
- index = index - 1 + ce.len - 1;
- }
- if (lastChar != 0) options.addElement(new RETokenChar(subIndex,lastChar,insens));
-
- if (posixID != -1) {
- options.addElement(new RETokenPOSIX(subIndex,posixID,insens,negate));
- } else if (np != null) {
- options.addElement(getRETokenNamedProperty(subIndex,np,insens,index));
- } else if (asciiEsc != 0) {
- lastChar = asciiEsc;
- } else {
- lastChar = pattern[index];
- }
- ++index;
- } else if ((ch == '[') && (syntax.get(RESyntax.RE_CHAR_CLASSES)) && (index < pLength) && (pattern[index] == ':')) {
- StringBuffer posixSet = new StringBuffer();
- index = getPosixSet(pattern,index+1,posixSet);
- int posixId = RETokenPOSIX.intValue(posixSet.toString());
- if (posixId != -1)
- options.addElement(new RETokenPOSIX(subIndex,posixId,insens,false));
- } else {
- if (lastChar != 0) options.addElement(new RETokenChar(subIndex,lastChar,insens));
- lastChar = ch;
- }
- if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
- } // while in list
- // Out of list, index is one past ']'
-
- if (lastChar != 0) options.addElement(new RETokenChar(subIndex,lastChar,insens));
-
// Create a new RETokenOneOf
+ ParseCharClassResult result = parseCharClass(
+ subIndex, pattern, index, pLength, cflags, syntax, 0);
addToken(currentToken);
- options.trimToSize();
- currentToken = new RETokenOneOf(subIndex,options,negative);
+ currentToken = result.token;
+ index = result.index;
}
// SUBEXPRESSIONS
@@ -1088,6 +984,199 @@
}
+ private static class ParseCharClassResult {
+ RETokenOneOf token;
+ int index;
+ boolean returnAtAndOperator = false;
+ }
+
+ /**
+ * Parse [...] or [^...] and make an RETokenOneOf instance.
+ * @param subIndex subIndex to be given to the created RETokenOneOf instance.
+ * @param pattern Input array of characters to be parsed.
+ * @param index Index of the character immediately following the opening '['.
+ * @param pLength Limit of the input array.
+ * @param cflags Compilation flags used to parse the pattern.
+ * @param syntax Syntax used to parse the pattern.
+ * @param pflags Flags that affect the behavior of this method (0 or RETURN_AT_AND).
+ */
+ private static ParseCharClassResult parseCharClass(int subIndex,
+ char[] pattern, int index,
+ int pLength, int cflags, RESyntax syntax, int pflags)
+ throws REException {
+
+ boolean insens = ((cflags & REG_ICASE) > 0);
+ Vector options = new Vector();
+ Vector addition = new Vector();
+ boolean additionAndAppeared = false;
+ final int RETURN_AT_AND = 0x01;
+ boolean returnAtAndOperator = ((pflags & RETURN_AT_AND) != 0);
+ boolean negative = false;
+ char ch;
+
+ char lastChar = 0;
+ boolean lastCharIsSet = false;
+ if (index == pLength) throw new REException(getLocalizedMessage("unmatched.bracket"),REException.REG_EBRACK,index);
+
+ // Check for initial caret, negation
+ if ((ch = pattern[index]) == '^') {
+ negative = true;
+ if (++index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
+ ch = pattern[index];
+ }
+
+ // Check for leading right bracket literal
+ if (ch == ']') {
+ lastChar = ch; lastCharIsSet = true;
+ if (++index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
+ }
+
+ while ((ch = pattern[index++]) != ']') {
+ if ((ch == '-') && (lastCharIsSet)) {
+ if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
+ if ((ch = pattern[index]) == ']') {
+ options.addElement(new RETokenChar(subIndex,lastChar,insens));
+ lastChar = '-';
+ } else {
+ if ((ch == '\\') && syntax.get(RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS)) {
+ CharExpression ce = getCharExpression(pattern, index, pLength, syntax);
+ if (ce == null)
+ throw new REException("invalid escape sequence", REException.REG_ESCAPE, index);
+ ch = ce.ch;
+ index = index + ce.len - 1;
+ }
+ options.addElement(new RETokenRange(subIndex,lastChar,ch,insens));
+ lastChar = 0; lastCharIsSet = false;
+ index++;
+ }
+ } else if ((ch == '\\') && syntax.get(RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS)) {
+ if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
+ int posixID = -1;
+ boolean negate = false;
+ char asciiEsc = 0;
+ boolean asciiEscIsSet = false;
+ NamedProperty np = null;
+ if (("dswDSW".indexOf(pattern[index]) != -1) && syntax.get(RESyntax.RE_CHAR_CLASS_ESC_IN_LISTS)) {
+ switch (pattern[index]) {
+ case 'D':
+ negate = true;
+ case 'd':
+ posixID = RETokenPOSIX.DIGIT;
+ break;
+ case 'S':
+ negate = true;
+ case 's':
+ posixID = RETokenPOSIX.SPACE;
+ break;
+ case 'W':
+ negate = true;
+ case 'w':
+ posixID = RETokenPOSIX.ALNUM;
+ break;
+ }
+ }
+ if (("pP".indexOf(pattern[index]) != -1) && syntax.get(RESyntax.RE_NAMED_PROPERTY)) {
+ np = getNamedProperty(pattern, index - 1, pLength);
+ if (np == null)
+ throw new REException("invalid escape sequence", REException.REG_ESCAPE, index);
+ index = index - 1 + np.len - 1;
+ }
+ else {
+ CharExpression ce = getCharExpression(pattern, index - 1, pLength, syntax);
+ if (ce == null)
+ throw new REException("invalid escape sequence", REException.REG_ESCAPE, index);
+ asciiEsc = ce.ch; asciiEscIsSet = true;
+ index = index - 1 + ce.len - 1;
+ }
+ if (lastCharIsSet) options.addElement(new RETokenChar(subIndex,lastChar,insens));
+
+ if (posixID != -1) {
+ options.addElement(new RETokenPOSIX(subIndex,posixID,insens,negate));
+ } else if (np != null) {
+ options.addElement(getRETokenNamedProperty(subIndex,np,insens,index));
+ } else if (asciiEscIsSet) {
+ lastChar = asciiEsc; lastCharIsSet = true;
+ } else {
+ lastChar = pattern[index]; lastCharIsSet = true;
+ }
+ ++index;
+ } else if ((ch == '[') && (syntax.get(RESyntax.RE_CHAR_CLASSES)) && (index < pLength) && (pattern[index] == ':')) {
+ StringBuffer posixSet = new StringBuffer();
+ index = getPosixSet(pattern,index+1,posixSet);
+ int posixId = RETokenPOSIX.intValue(posixSet.toString());
+ if (posixId != -1)
+ options.addElement(new RETokenPOSIX(subIndex,posixId,insens,false));
+ } else if ((ch == '[') && (syntax.get(RESyntax.RE_NESTED_CHARCLASS))) {
+ ParseCharClassResult result = parseCharClass(
+ subIndex, pattern, index, pLength, cflags, syntax, 0);
+ addition.addElement(result.token);
+ addition.addElement("|");
+ index = result.index;
+ } else if ((ch == '&') &&
+ (syntax.get(RESyntax.RE_NESTED_CHARCLASS)) &&
+ (index < pLength) && (pattern[index] == '&')) {
+ if (returnAtAndOperator) {
+ ParseCharClassResult result = new ParseCharClassResult();
+ options.trimToSize();
+ if (additionAndAppeared) addition.addElement("&");
+ if (addition.size() == 0) addition = null;
+ result.token = new RETokenOneOf(subIndex,
+ options, addition, negative);
+ result.index = index - 1;
+ result.returnAtAndOperator = true;
+ return result;
+ }
+ // The precedence of the operator "&&" is the lowest.
+ // So we postpone adding "&" until other elements
+ // are added. And we insert Boolean.FALSE at the
+ // beginning of the list of tokens following "&&".
+ // So, "&&[a-b][k-m]" will be stored in the Vector
+ // addition in this order:
+ // Boolean.FALSE, [a-b], "|", [k-m], "|", "&"
+ if (additionAndAppeared) addition.addElement("&");
+ addition.addElement(Boolean.FALSE);
+ additionAndAppeared = true;
+
+ // The part on which "&&" operates may be either
+ // (1) explicitly enclosed by []
+ // or
+ // (2) not enclosed by [] and terminated by the
+ // next "&&" or the end of the character list.
+ // Let the preceding else if block do the case (1).
+ // We must do something in case of (2).
+ if ((index + 1 < pLength) && (pattern[index + 1] != '[')) {
+ ParseCharClassResult result = parseCharClass(
+ subIndex, pattern, index+1, pLength, cflags, syntax,
+ RETURN_AT_AND);
+ addition.addElement(result.token);
+ addition.addElement("|");
+ // If the method returned at the next "&&", it is OK.
+ // Otherwise we have eaten the mark of the end of this
+ // character list "]". In this case we must give back
+ // the end mark.
+ index = (result.returnAtAndOperator ?
+ result.index: result.index - 1);
+ }
+ } else {
+ if (lastCharIsSet) options.addElement(new RETokenChar(subIndex,lastChar,insens));
+ lastChar = ch; lastCharIsSet = true;
+ }
+ if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
+ } // while in list
+ // Out of list, index is one past ']'
+
+ if (lastCharIsSet) options.addElement(new RETokenChar(subIndex,lastChar,insens));
+
+ ParseCharClassResult result = new ParseCharClassResult();
+ // Create a new RETokenOneOf
+ options.trimToSize();
+ if (additionAndAppeared) addition.addElement("&");
+ if (addition.size() == 0) addition = null;
+ result.token = new RETokenOneOf(subIndex,options, addition, negative);
+ result.index = index;
+ return result;
+ }
+
private static int getCharUnit(char[] input, int index, CharUnit unit, boolean quot) throws REException {
unit.ch = input[index++];
unit.bk = (unit.ch == '\\'
@@ -1124,7 +1213,7 @@
public String toString() { return expr; }
}
- private CharExpression getCharExpression(char[] input, int pos, int lim,
+ private static CharExpression getCharExpression(char[] input, int pos, int lim,
RESyntax syntax) {
CharExpression ce = new CharExpression();
char c = input[pos];
@@ -1216,7 +1305,7 @@
int len;
}
- private NamedProperty getNamedProperty(char[] input, int pos, int lim) {
+ private static NamedProperty getNamedProperty(char[] input, int pos, int lim) {
NamedProperty np = new NamedProperty();
char c = input[pos];
if (c == '\\') {
Index: gnu/regexp/RESyntax.java
===================================================================
--- gnu/regexp/RESyntax.java (revision 110936)
+++ gnu/regexp/RESyntax.java (working copy)
@@ -1,5 +1,5 @@
/* gnu/regexp/RESyntax.java
- Copyright (C) 1998-2002, 2004 Free Software Foundation, Inc.
+ Copyright (C) 2006 Free Software Foundation, Inc.
This file is part of GNU Classpath.
@@ -227,8 +227,13 @@
*/
public static final int RE_NAMED_PROPERTY = 30;
- private static final int BIT_TOTAL = 31;
+ /**
+ * Syntax bit. Allow nested character classes ([a-z&&[^p-r]]), as in Java 1.4.
+ */
+ public static final int RE_NESTED_CHARCLASS = 31;
+ private static final int BIT_TOTAL = 32;
+
/**
* Predefined syntax.
* Emulates regular expression support in the awk utility.
@@ -461,6 +466,7 @@
// XXX
.set(RE_POSSESSIVE_OPS) // *+,?+,++,{}+
.set(RE_UNICODE_CHAR) // \u1234
+ .set(RE_NESTED_CHARCLASS) // [a-z&&[^p-r]]
.makeFinal();
}
Index: gnu/regexp/RETokenOneOf.java
===================================================================
--- gnu/regexp/RETokenOneOf.java (revision 110936)
+++ gnu/regexp/RETokenOneOf.java (working copy)
@@ -1,5 +1,5 @@
/* gnu/regexp/RETokenOneOf.java
- Copyright (C) 1998-2001, 2004 Free Software Foundation, Inc.
+ Copyright (C) 2006 Free Software Foundation, Inc.
This file is part of GNU Classpath.
@@ -37,11 +37,35 @@
package gnu.regexp;
import java.util.Vector;
+import java.util.Stack;
final class RETokenOneOf extends REToken {
private Vector options;
private boolean negative;
+ private Vector addition;
+ // This Vector addition is used to store nested character classes.
+ // For example, if the original expression is
+ // [2-7a-c[f-k][m-z]&&[^p-v][st]]
+ // the basic part /2-7a-c/ is stored in the Vector options, and
+ // the additional part /[f-k][m-z]&&[^p-v][st]/ is stored in the
+ // Vector addition in the following order (Reverse Polish Notation):
+ // -- The matching result of the basic part is assumed here.
+ // [f-k] -- REToken
+ // "|" -- or
+ // [m-z] -- REToken
+ // "|" -- or
+ // false
+ // [^p-v] -- REToken
+ // "|" -- or
+ // [st] -- REToken
+ // "|" -- or
+ // "&" -- and
+ //
+ // As is clear from the explanation above, the Vector addition is
+ // effective only when this REToken originates from a character class
+ // expression.
+
// This constructor is used for convenience when we know the set beforehand,
// e.g. \d --> new RETokenOneOf("0123456789",false, ..)
// \D --> new RETokenOneOf("0123456789",true, ..)
@@ -60,7 +84,17 @@
this.negative = negative;
}
+ RETokenOneOf(int subIndex, Vector options, Vector addition, boolean negative) {
+ super(subIndex);
+ this.options = options;
+ this.addition = addition;
+ this.negative = negative;
+ }
+
int getMinimumLength() {
+ // (negative || addition != null) occurs when this token originates from
+ // a character class expression.
+ if (negative || addition != null) return 1;
int min = Integer.MAX_VALUE;
int x;
for (int i=0; i < options.size(); i++) {
@@ -70,8 +104,10 @@
return min;
}
-
int getMaximumLength() {
+ // (negative || addition != null) occurs when this token originates from
+ // a character class expression.
+ if (negative || addition != null) return 1;
int max = 0;
int x;
for (int i=0; i < options.size(); i++) {
@@ -82,56 +118,106 @@
}
boolean match(CharIndexed input, REMatch mymatch) {
- return negative ? matchN(input, mymatch) : matchP(input, mymatch);
- }
+ REMatch tryMatch;
+ boolean tryOnly;
+ if (addition == null) {
+ tryMatch = mymatch;
+ tryOnly = false;
+ }
+ else {
+ tryMatch = (REMatch) mymatch.clone();
+ tryOnly = true;
+ }
+ boolean b = negative ?
+ matchN(input, tryMatch, tryOnly) :
+ matchP(input, tryMatch, tryOnly);
+ if (addition == null) return b;
- private boolean matchN(CharIndexed input, REMatch mymatch) {
- if (input.charAt(mymatch.index) == CharIndexed.OUT_OF_BOUNDS)
+ Stack stack = new Stack();
+ stack.push(new Boolean(b));
+ for (int i=0; i < addition.size(); i++) {
+ Object obj = addition.elementAt(i);
+ if (obj instanceof REToken) {
+ b = ((REToken)obj).match(input, (REMatch)mymatch.clone());
+ stack.push(new Boolean(b));
+ }
+ else if (obj instanceof Boolean) {
+ stack.push(obj);
+ }
+ else if (obj.equals("|")) {
+ b = ((Boolean)stack.pop()).booleanValue();
+ b = ((Boolean)stack.pop()).booleanValue() || b;
+ stack.push(new Boolean(b));
+ }
+ else if (obj.equals("&")) {
+ b = ((Boolean)stack.pop()).booleanValue();
+ b = ((Boolean)stack.pop()).booleanValue() && b;
+ stack.push(new Boolean(b));
+ }
+ else {
+ throw new RuntimeException("Invalid object found");
+ }
+ }
+ b = ((Boolean)stack.pop()).booleanValue();
+ if (b) {
+ ++mymatch.index;
+ return next(input, mymatch);
+ }
return false;
+ }
- REMatch newMatch = null;
- REMatch last = null;
- REToken tk;
- for (int i=0; i < options.size(); i++) {
+ private boolean matchN(CharIndexed input, REMatch mymatch, boolean tryOnly) {
+ if (input.charAt(mymatch.index) == CharIndexed.OUT_OF_BOUNDS)
+ return false;
+
+ REMatch newMatch = null;
+ REMatch last = null;
+ REToken tk;
+ for (int i=0; i < options.size(); i++) {
tk = (REToken) options.elementAt(i);
REMatch tryMatch = (REMatch) mymatch.clone();
if (tk.match(input, tryMatch)) { // match was successful
return false;
} // is a match
- } // try next option
+ } // try next option
- ++mymatch.index;
- return next(input, mymatch);
- }
+ if (tryOnly) return true;
+ ++mymatch.index;
+ return next(input, mymatch);
+ }
- private boolean matchP(CharIndexed input, REMatch mymatch) {
- REMatch.REMatchList newMatch = new REMatch.REMatchList();
- REToken tk;
- for (int i=0; i < options.size(); i++) {
+ private boolean matchP(CharIndexed input, REMatch mymatch, boolean tryOnly) {
+ REMatch.REMatchList newMatch = new REMatch.REMatchList();
+ REToken tk;
+ for (int i=0; i < options.size(); i++) {
// In order that the backtracking can work,
// each option must be chained to the next token.
// But the chain method has some side effect, so
// we use clones.
tk = (REToken)((REToken) options.elementAt(i)).clone();
- tk.chain(this.next);
- tk.setUncle(this.uncle);
- tk.subIndex = this.subIndex;
+ if (! tryOnly) {
+ tk.chain(this.next);
+ tk.setUncle(this.uncle);
+ tk.subIndex = this.subIndex;
+ }
REMatch tryMatch = (REMatch) mymatch.clone();
if (tk.match(input, tryMatch)) { // match was successful
- newMatch.addTail(tryMatch);
+ if (tryOnly) return true;
+ newMatch.addTail(tryMatch);
} // is a match
- } // try next option
+ } // try next option
+ if (tryOnly) return false;
- if (newMatch.head != null) {
- // set contents of mymatch equal to newMatch
+ if (newMatch.head != null) {
+ // set contents of mymatch equal to newMatch
- // try each one that matched
- mymatch.assignFrom(newMatch.head);
- return true;
- } else {
- return false;
+ // try each one that matched
+ mymatch.assignFrom(newMatch.head);
+ return true;
+ } else {
+ return false;
+ }
}
- }
void dump(StringBuffer os) {
os.append(negative ? "[^" : "(?:");
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2006-02-13 22:58 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2006-02-13 22:58 [4.1] Patch: FYI: another regex patch Tom Tromey
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).