/*
* Copyright (c) 1995, 2005, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package java.io;
import java.util.Arrays;
/**
* The StreamTokenizer
class takes an input stream and
* parses it into "tokens", allowing the tokens to be
* read one at a time. The parsing process is controlled by a table
* and a number of flags that can be set to various states. The
* stream tokenizer can recognize identifiers, numbers, quoted
* strings, and various comment styles.
*
* Each byte read from the input stream is regarded as a character
* in the range '\u0000'
through '\u00FF'
.
* The character value is used to look up five possible attributes of
* the character: white space, alphabetic,
* numeric, string quote, and comment character.
* Each character can have zero or more of these attributes.
*
* In addition, an instance has four flags. These flags indicate: *
* A typical application first constructs an instance of this class,
* sets up the syntax tables, and then repeatedly loops calling the
* nextToken
method in each iteration of the loop until
* it returns the value TT_EOF
.
*
* @author James Gosling
* @see java.io.StreamTokenizer#nextToken()
* @see java.io.StreamTokenizer#TT_EOF
* @since JDK1.0
*/
public class StreamTokenizer {
/* Only one of these will be non-null */
private Reader reader = null;
private InputStream input = null;
private char buf[] = new char[20];
/**
* The next character to be considered by the nextToken method. May also
* be NEED_CHAR to indicate that a new character should be read, or SKIP_LF
* to indicate that a new character should be read and, if it is a '\n'
* character, it should be discarded and a second new character should be
* read.
*/
private int peekc = NEED_CHAR;
private static final int NEED_CHAR = Integer.MAX_VALUE;
private static final int SKIP_LF = Integer.MAX_VALUE - 1;
private boolean pushedBack;
private boolean forceLower;
/** The line number of the last token read */
private int LINENO = 1;
private boolean eolIsSignificantP = false;
private boolean slashSlashCommentsP = false;
private boolean slashStarCommentsP = false;
private byte ctype[] = new byte[256];
private static final byte CT_WHITESPACE = 1;
private static final byte CT_DIGIT = 2;
private static final byte CT_ALPHA = 4;
private static final byte CT_QUOTE = 8;
private static final byte CT_COMMENT = 16;
/**
* After a call to the nextToken
method, this field
* contains the type of the token just read. For a single character
* token, its value is the single character, converted to an integer.
* For a quoted string token, its value is the quote character.
* Otherwise, its value is one of the following:
*
TT_WORD
indicates that the token is a word.
* TT_NUMBER
indicates that the token is a number.
* TT_EOL
indicates that the end of line has been read.
* The field can only have this value if the
* eolIsSignificant
method has been called with the
* argument true
.
* TT_EOF
indicates that the end of the input stream
* has been reached.
* * The initial value of this field is -4. * * @see java.io.StreamTokenizer#eolIsSignificant(boolean) * @see java.io.StreamTokenizer#nextToken() * @see java.io.StreamTokenizer#quoteChar(int) * @see java.io.StreamTokenizer#TT_EOF * @see java.io.StreamTokenizer#TT_EOL * @see java.io.StreamTokenizer#TT_NUMBER * @see java.io.StreamTokenizer#TT_WORD */ public int ttype = TT_NOTHING; /** * A constant indicating that the end of the stream has been read. */ public static final int TT_EOF = -1; /** * A constant indicating that the end of the line has been read. */ public static final int TT_EOL = '\n'; /** * A constant indicating that a number token has been read. */ public static final int TT_NUMBER = -2; /** * A constant indicating that a word token has been read. */ public static final int TT_WORD = -3; /* A constant indicating that no token has been read, used for * initializing ttype. FIXME This could be made public and * made available as the part of the API in a future release. */ private static final int TT_NOTHING = -4; /** * If the current token is a word token, this field contains a * string giving the characters of the word token. When the current * token is a quoted string token, this field contains the body of * the string. *
* The current token is a word when the value of the
* ttype
field is TT_WORD
. The current token is
* a quoted string token when the value of the ttype
field is
* a quote character.
*
* The initial value of this field is null.
*
* @see java.io.StreamTokenizer#quoteChar(int)
* @see java.io.StreamTokenizer#TT_WORD
* @see java.io.StreamTokenizer#ttype
*/
public String sval;
/**
* If the current token is a number, this field contains the value
* of that number. The current token is a number when the value of
* the ttype
field is TT_NUMBER
.
*
* The initial value of this field is 0.0. * * @see java.io.StreamTokenizer#TT_NUMBER * @see java.io.StreamTokenizer#ttype */ public double nval; /** Private constructor that initializes everything except the streams. */ private StreamTokenizer() { wordChars('a', 'z'); wordChars('A', 'Z'); wordChars(128 + 32, 255); whitespaceChars(0, ' '); commentChar('/'); quoteChar('"'); quoteChar('\''); parseNumbers(); } /** * Creates a stream tokenizer that parses the specified input * stream. The stream tokenizer is initialized to the following * default state: *
'A'
through 'Z'
,
* 'a'
through 'z'
, and
* '\u00A0'
through '\u00FF'
are
* considered to be alphabetic.
* '\u0000'
through
* '\u0020'
are considered to be white space.
* '/'
is a comment character.
* '\''
and double quote '"'
* are string quote characters.
* * * @param is an input stream. * @see java.io.BufferedReader * @see java.io.InputStreamReader * @see java.io.StreamTokenizer#StreamTokenizer(java.io.Reader) */ @Deprecated public StreamTokenizer(InputStream is) { this(); if (is == null) { throw new NullPointerException(); } input = is; } /** * Create a tokenizer that parses the given character stream. * * @param r a Reader object providing the input stream. * @since JDK1.1 */ public StreamTokenizer(Reader r) { this(); if (r == null) { throw new NullPointerException(); } reader = r; } /** * Resets this tokenizer's syntax table so that all characters are * "ordinary." See the* Reader r = new BufferedReader(new InputStreamReader(is)); * StreamTokenizer st = new StreamTokenizer(r); *
ordinaryChar
method
* for more information on a character being ordinary.
*
* @see java.io.StreamTokenizer#ordinaryChar(int)
*/
public void resetSyntax() {
for (int i = ctype.length; --i >= 0;)
ctype[i] = 0;
}
/**
* Specifies that all characters c in the range
* low <= c <= high
* are word constituents. A word token consists of a word constituent
* followed by zero or more word constituents or number constituents.
*
* @param low the low end of the range.
* @param hi the high end of the range.
*/
public void wordChars(int low, int hi) {
if (low < 0)
low = 0;
if (hi >= ctype.length)
hi = ctype.length - 1;
while (low <= hi)
ctype[low++] |= CT_ALPHA;
}
/**
* Specifies that all characters c in the range
* low <= c <= high
* are white space characters. White space characters serve only to
* separate tokens in the input stream.
*
* Any other attribute settings for the characters in the specified
* range are cleared.
*
* @param low the low end of the range.
* @param hi the high end of the range.
*/
public void whitespaceChars(int low, int hi) {
if (low < 0)
low = 0;
if (hi >= ctype.length)
hi = ctype.length - 1;
while (low <= hi)
ctype[low++] = CT_WHITESPACE;
}
/**
* Specifies that all characters c in the range
* low <= c <= high
* are "ordinary" in this tokenizer. See the
* ordinaryChar
method for more information on a
* character being ordinary.
*
* @param low the low end of the range.
* @param hi the high end of the range.
* @see java.io.StreamTokenizer#ordinaryChar(int)
*/
public void ordinaryChars(int low, int hi) {
if (low < 0)
low = 0;
if (hi >= ctype.length)
hi = ctype.length - 1;
while (low <= hi)
ctype[low++] = 0;
}
/**
* Specifies that the character argument is "ordinary"
* in this tokenizer. It removes any special significance the
* character has as a comment character, word component, string
* delimiter, white space, or number character. When such a character
* is encountered by the parser, the parser treats it as a
* single-character token and sets ttype
field to the
* character value.
*
*
Making a line terminator character "ordinary" may interfere
* with the ability of a StreamTokenizer
to count
* lines. The lineno
method may no longer reflect
* the presence of such terminator characters in its line count.
*
* @param ch the character.
* @see java.io.StreamTokenizer#ttype
*/
public void ordinaryChar(int ch) {
if (ch >= 0 && ch < ctype.length)
ctype[ch] = 0;
}
/**
* Specified that the character argument starts a single-line
* comment. All characters from the comment character to the end of
* the line are ignored by this stream tokenizer.
*
*
Any other attribute settings for the specified character are cleared. * * @param ch the character. */ public void commentChar(int ch) { if (ch >= 0 && ch < ctype.length) ctype[ch] = CT_COMMENT; } /** * Specifies that matching pairs of this character delimit string * constants in this tokenizer. *
* When the nextToken
method encounters a string
* constant, the ttype
field is set to the string
* delimiter and the sval
field is set to the body of
* the string.
*
* If a string quote character is encountered, then a string is
* recognized, consisting of all characters after (but not including)
* the string quote character, up to (but not including) the next
* occurrence of that same string quote character, or a line
* terminator, or end of file. The usual escape sequences such as
* "\n"
and "\t"
are recognized and
* converted to single characters as the string is parsed.
*
*
Any other attribute settings for the specified character are cleared. * * @param ch the character. * @see java.io.StreamTokenizer#nextToken() * @see java.io.StreamTokenizer#sval * @see java.io.StreamTokenizer#ttype */ public void quoteChar(int ch) { if (ch >= 0 && ch < ctype.length) ctype[ch] = CT_QUOTE; } /** * Specifies that numbers should be parsed by this tokenizer. The * syntax table of this tokenizer is modified so that each of the twelve * characters: *
** 0 1 2 3 4 5 6 7 8 9 . - *
* has the "numeric" attribute. *
* When the parser encounters a word token that has the format of a
* double precision floating-point number, it treats the token as a
* number rather than a word, by setting the ttype
* field to the value TT_NUMBER
and putting the numeric
* value of the token into the nval
field.
*
* @see java.io.StreamTokenizer#nval
* @see java.io.StreamTokenizer#TT_NUMBER
* @see java.io.StreamTokenizer#ttype
*/
public void parseNumbers() {
for (int i = '0'; i <= '9'; i++)
ctype[i] |= CT_DIGIT;
ctype['.'] |= CT_DIGIT;
ctype['-'] |= CT_DIGIT;
}
/**
* Determines whether or not ends of line are treated as tokens.
* If the flag argument is true, this tokenizer treats end of lines
* as tokens; the nextToken
method returns
* TT_EOL
and also sets the ttype
field to
* this value when an end of line is read.
*
* A line is a sequence of characters ending with either a
* carriage-return character ('\r'
) or a newline
* character ('\n'
). In addition, a carriage-return
* character followed immediately by a newline character is treated
* as a single end-of-line token.
*
* If the flag
is false, end-of-line characters are
* treated as white space and serve only to separate tokens.
*
* @param flag true
indicates that end-of-line characters
* are separate tokens; false
indicates that
* end-of-line characters are white space.
* @see java.io.StreamTokenizer#nextToken()
* @see java.io.StreamTokenizer#ttype
* @see java.io.StreamTokenizer#TT_EOL
*/
public void eolIsSignificant(boolean flag) {
eolIsSignificantP = flag;
}
/**
* Determines whether or not the tokenizer recognizes C-style comments.
* If the flag argument is true
, this stream tokenizer
* recognizes C-style comments. All text between successive
* occurrences of /*
and */
are discarded.
*
* If the flag argument is false
, then C-style comments
* are not treated specially.
*
* @param flag true
indicates to recognize and ignore
* C-style comments.
*/
public void slashStarComments(boolean flag) {
slashStarCommentsP = flag;
}
/**
* Determines whether or not the tokenizer recognizes C++-style comments.
* If the flag argument is true
, this stream tokenizer
* recognizes C++-style comments. Any occurrence of two consecutive
* slash characters ('/'
) is treated as the beginning of
* a comment that extends to the end of the line.
*
* If the flag argument is false
, then C++-style
* comments are not treated specially.
*
* @param flag true
indicates to recognize and ignore
* C++-style comments.
*/
public void slashSlashComments(boolean flag) {
slashSlashCommentsP = flag;
}
/**
* Determines whether or not word token are automatically lowercased.
* If the flag argument is true
, then the value in the
* sval
field is lowercased whenever a word token is
* returned (the ttype
field has the
* value TT_WORD
by the nextToken
method
* of this tokenizer.
*
* If the flag argument is false
, then the
* sval
field is not modified.
*
* @param fl true
indicates that all word tokens should
* be lowercased.
* @see java.io.StreamTokenizer#nextToken()
* @see java.io.StreamTokenizer#ttype
* @see java.io.StreamTokenizer#TT_WORD
*/
public void lowerCaseMode(boolean fl) {
forceLower = fl;
}
/** Read the next character */
private int read() throws IOException {
if (reader != null)
return reader.read();
else if (input != null)
return input.read();
else
throw new IllegalStateException();
}
/**
* Parses the next token from the input stream of this tokenizer.
* The type of the next token is returned in the ttype
* field. Additional information about the token may be in the
* nval
field or the sval
field of this
* tokenizer.
*
* Typical clients of this
* class first set up the syntax tables and then sit in a loop
* calling nextToken to parse successive tokens until TT_EOF
* is returned.
*
* @return the value of the ttype
field.
* @exception IOException if an I/O error occurs.
* @see java.io.StreamTokenizer#nval
* @see java.io.StreamTokenizer#sval
* @see java.io.StreamTokenizer#ttype
*/
public int nextToken() throws IOException {
if (pushedBack) {
pushedBack = false;
return ttype;
}
byte ct[] = ctype;
sval = null;
int c = peekc;
if (c < 0)
c = NEED_CHAR;
if (c == SKIP_LF) {
c = read();
if (c < 0)
return ttype = TT_EOF;
if (c == '\n')
c = NEED_CHAR;
}
if (c == NEED_CHAR) {
c = read();
if (c < 0)
return ttype = TT_EOF;
}
ttype = c; /* Just to be safe */
/* Set peekc so that the next invocation of nextToken will read
* another character unless peekc is reset in this invocation
*/
peekc = NEED_CHAR;
int ctype = c < 256 ? ct[c] : CT_ALPHA;
while ((ctype & CT_WHITESPACE) != 0) {
if (c == '\r') {
LINENO++;
if (eolIsSignificantP) {
peekc = SKIP_LF;
return ttype = TT_EOL;
}
c = read();
if (c == '\n')
c = read();
} else {
if (c == '\n') {
LINENO++;
if (eolIsSignificantP) {
return ttype = TT_EOL;
}
}
c = read();
}
if (c < 0)
return ttype = TT_EOF;
ctype = c < 256 ? ct[c] : CT_ALPHA;
}
if ((ctype & CT_DIGIT) != 0) {
boolean neg = false;
if (c == '-') {
c = read();
if (c != '.' && (c < '0' || c > '9')) {
peekc = c;
return ttype = '-';
}
neg = true;
}
double v = 0;
int decexp = 0;
int seendot = 0;
while (true) {
if (c == '.' && seendot == 0)
seendot = 1;
else if ('0' <= c && c <= '9') {
v = v * 10 + (c - '0');
decexp += seendot;
} else
break;
c = read();
}
peekc = c;
if (decexp != 0) {
double denom = 10;
decexp--;
while (decexp > 0) {
denom *= 10;
decexp--;
}
/* Do one division of a likely-to-be-more-accurate number */
v = v / denom;
}
nval = neg ? -v : v;
return ttype = TT_NUMBER;
}
if ((ctype & CT_ALPHA) != 0) {
int i = 0;
do {
if (i >= buf.length) {
buf = Arrays.copyOf(buf, buf.length * 2);
}
buf[i++] = (char) c;
c = read();
ctype = c < 0 ? CT_WHITESPACE : c < 256 ? ct[c] : CT_ALPHA;
} while ((ctype & (CT_ALPHA | CT_DIGIT)) != 0);
peekc = c;
sval = String.copyValueOf(buf, 0, i);
if (forceLower)
sval = sval.toLowerCase();
return ttype = TT_WORD;
}
if ((ctype & CT_QUOTE) != 0) {
ttype = c;
int i = 0;
/* Invariants (because \Octal needs a lookahead):
* (i) c contains char value
* (ii) d contains the lookahead
*/
int d = read();
while (d >= 0 && d != ttype && d != '\n' && d != '\r') {
if (d == '\\') {
c = read();
int first = c; /* To allow \377, but not \477 */
if (c >= '0' && c <= '7') {
c = c - '0';
int c2 = read();
if ('0' <= c2 && c2 <= '7') {
c = (c << 3) + (c2 - '0');
c2 = read();
if ('0' <= c2 && c2 <= '7' && first <= '3') {
c = (c << 3) + (c2 - '0');
d = read();
} else
d = c2;
} else
d = c2;
} else {
switch (c) {
case 'a':
c = 0x7;
break;
case 'b':
c = '\b';
break;
case 'f':
c = 0xC;
break;
case 'n':
c = '\n';
break;
case 'r':
c = '\r';
break;
case 't':
c = '\t';
break;
case 'v':
c = 0xB;
break;
}
d = read();
}
} else {
c = d;
d = read();
}
if (i >= buf.length) {
buf = Arrays.copyOf(buf, buf.length * 2);
}
buf[i++] = (char)c;
}
/* If we broke out of the loop because we found a matching quote
* character then arrange to read a new character next time
* around; otherwise, save the character.
*/
peekc = (d == ttype) ? NEED_CHAR : d;
sval = String.copyValueOf(buf, 0, i);
return ttype;
}
if (c == '/' && (slashSlashCommentsP || slashStarCommentsP)) {
c = read();
if (c == '*' && slashStarCommentsP) {
int prevc = 0;
while ((c = read()) != '/' || prevc != '*') {
if (c == '\r') {
LINENO++;
c = read();
if (c == '\n') {
c = read();
}
} else {
if (c == '\n') {
LINENO++;
c = read();
}
}
if (c < 0)
return ttype = TT_EOF;
prevc = c;
}
return nextToken();
} else if (c == '/' && slashSlashCommentsP) {
while ((c = read()) != '\n' && c != '\r' && c >= 0);
peekc = c;
return nextToken();
} else {
/* Now see if it is still a single line comment */
if ((ct['/'] & CT_COMMENT) != 0) {
while ((c = read()) != '\n' && c != '\r' && c >= 0);
peekc = c;
return nextToken();
} else {
peekc = c;
return ttype = '/';
}
}
}
if ((ctype & CT_COMMENT) != 0) {
while ((c = read()) != '\n' && c != '\r' && c >= 0);
peekc = c;
return nextToken();
}
return ttype = c;
}
/**
* Causes the next call to the nextToken
method of this
* tokenizer to return the current value in the ttype
* field, and not to modify the value in the nval
or
* sval
field.
*
* @see java.io.StreamTokenizer#nextToken()
* @see java.io.StreamTokenizer#nval
* @see java.io.StreamTokenizer#sval
* @see java.io.StreamTokenizer#ttype
*/
public void pushBack() {
if (ttype != TT_NOTHING) /* No-op if nextToken() not called */
pushedBack = true;
}
/**
* Return the current line number.
*
* @return the current line number of this stream tokenizer.
*/
public int lineno() {
return LINENO;
}
/**
* Returns the string representation of the current stream token and
* the line number it occurs on.
*
*
The precise string returned is unspecified, although the following * example can be considered typical: * *
* * @return a string representation of the token * @see java.io.StreamTokenizer#nval * @see java.io.StreamTokenizer#sval * @see java.io.StreamTokenizer#ttype */ public String toString() { String ret; switch (ttype) { case TT_EOF: ret = "EOF"; break; case TT_EOL: ret = "EOL"; break; case TT_WORD: ret = sval; break; case TT_NUMBER: ret = "n=" + nval; break; case TT_NOTHING: ret = "NOTHING"; break; default: { /* * ttype is the first character of either a quoted string or * is an ordinary character. ttype can definitely not be less * than 0, since those are reserved values used in the previous * case statements */ if (ttype < 256 && ((ctype[ttype] & CT_QUOTE) != 0)) { ret = sval; break; } char s[] = new char[3]; s[0] = s[2] = '\''; s[1] = (char) ttype; ret = new String(s); break; } } return "Token[" + ret + "], line " + LINENO; } }Token['a'], line 10