/*
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
// NOTE: Add I18N support to this class when JDK gets the ability to
// defer selection of locale for exception messages ... use the same
// technique for both.
/**
* This handles several XML-related tasks that normal java.io Readers
* don't support, including use of IETF standard encoding names and
* automatic detection of most XML encodings. The former is needed
* for interoperability; the latter is needed to conform with the XML
* spec. This class also optimizes reading some common encodings by
* providing low-overhead unsynchronized Reader support.
* <P> Note that the autodetection facility should be used only on
* data streams which have an unknown character encoding. For example,
* it should never be used on MIME text/xml entities.
* <P> Note that XML processors are only required to support UTF-8 and
* UTF-16 character encodings. Autodetection permits the underlying Java
* implementation to provide support for many other encodings, such as
* US-ASCII, ISO-8859-5, Shift_JIS, EUC-JP, and ISO-2022-JP.
*
* @author David Brownell
* @author Janet Koenig
* @version 1.3 00/02/24
*/
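// Illustrative usage sketch (not from the original source).  The factory
// name "createReader" is assumed from the factory javadoc below, and the
// file name is an example; assumes java.io.* is imported:
//
//     InputStream in = new FileInputStream("document.xml");
//     Reader r = createReader(in, null);   // null encoding => autodetect
//     // ... hand r to the XML scanner ...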
// package private
private boolean closed;
//
// This class always delegates I/O to a reader, which gets
// its data from the very beginning of the XML text. It needs
// to use a pushback stream since (a) autodetection can read
// partial UTF-8 characters which need to be fully processed,
// (b) the "Unicode" readers swallow characters that they think
// are byte order marks, so tests fail if they don't see the
// real byte order mark.
//
// It's got to do this efficiently: character I/O is solidly on the
// critical path. (So keep buffer length over 2 Kbytes to avoid
// excess buffering. Many URL handlers stuff a BufferedInputStream
// between here and the real data source, and larger buffers keep
// that from slowing you down.)
//
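// Illustrative sketch (not the original constructor) of the wrapping the
// comment above describes: reuse the caller's pushback stream if there is
// one, otherwise allocate one whose pushback buffer comfortably exceeds
// the bytes peeked at during autodetection.  The helper name and the 4096
// figure are assumptions for this example; assumes java.io.* is imported.
private static PushbackInputStream wrapForAutodetect(InputStream stream) {
    if (stream instanceof PushbackInputStream)
        return (PushbackInputStream) stream;
    // keep the buffer over 2 Kbytes, per the note above
    return new PushbackInputStream(stream, 4096);
}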
/**
* Constructs the reader from an input stream, auto-detecting
* the encoding to use according to the heuristic specified
* in the XML 1.0 recommendation.
*
* @param in the input stream from which the reader is constructed
* @throws IOException on error, such as unrecognized encoding
*/
}
/**
* Creates a reader supporting the given encoding, mapping
* from standard encoding names to ones understood by
* Java where necessary.
*
* @param in the input stream from which the reader is constructed
* @param encoding the IETF standard name of the encoding to use;
* if null, auto-detection is used.
* @throws IOException on error, including unrecognized encoding
*/
throws IOException {
if ("UTF-8".equalsIgnoreCase(encoding) || "UTF8".equalsIgnoreCase(encoding))
    return new Utf8Reader(in);
if ("US-ASCII".equalsIgnoreCase(encoding) || "ASCII".equalsIgnoreCase(encoding))
    return new AsciiReader(in);
if ("ISO-8859-1".equalsIgnoreCase(encoding)
    // plus numerous aliases ...
    )
    return new Iso8859_1Reader(in);
//
// What we really want is an administerable resource mapping
// encoding names/aliases to reader classes, e.g. a property
// file resource, "readers/mapping.props", and a set
// of readers in that (sub)package ... defaulting to this call
// only if no better choice is available.
//
}
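// Illustrative sketch of the "administerable resource mapping" wished for
// in the comment above: a properties resource that maps IETF encoding
// names to reader class names.  The resource path, method name, and entry
// format are assumptions for this example; assumes java.util.Properties
// and java.io.* are available.
private static Properties loadReaderMapping(Class<?> anchor) throws IOException {
    Properties map = new Properties();
    // each line names a reader class, e.g.  ISO-8859-1=Iso8859_1Reader
    try (InputStream res = anchor.getResourceAsStream("readers/mapping.props")) {
        if (res != null)
            map.load(res);
    }
    return map;
}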
//
// JDK doesn't know all of the standard encoding names, and
// in particular none of the EBCDIC ones IANA defines (and
// which IBM encourages).
//
static {
// NOTE: no support for ISO-10646-UCS-4 yet.
// IANA also defines two that JDK 1.2 doesn't handle:
// EBCDIC-CP-GR --> CP423
// EBCDIC-CP-TR --> CP905
}
// returns an encoding name supported by JDK >= 1.1.6
// for some cases required by the XML spec
}
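// Illustrative sketch of such a name mapping.  The table below holds a few
// example entries only (chosen to match encodings discussed in this file);
// the field name and use of java.util.HashMap are assumptions.
private static final Map<String, String> IANA_TO_JAVA =
        new HashMap<String, String>();
static {
    IANA_TO_JAVA.put("EBCDIC-CP-US", "CP037");   // IANA EBCDIC name -> JDK name
    IANA_TO_JAVA.put("ISO-8859-5", "ISO8859_5");
    IANA_TO_JAVA.put("SHIFT_JIS", "SJIS");
}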
/**
* Returns the standard name of the encoding in use
*/
return assignedEncoding;
}
super(stream);
byte buf [];
int len;
if (stream instanceof PushbackInputStream)
else
//
// See if we can figure out the character encoding used
// in this file by peeking at the first few bytes.
//
buf = new byte[4];
if (len > 0)
if (len == 4)
case 0:
// 00 3c 00 3f == illegal UTF-16 big-endian
return;
}
// else it's probably UCS-4
break;
case '<': // 0x3c: the most common cases!
// First character is '<'; could be XML without
// an XML directive such as "<hello>", "<!-- ...",
// and so on.
default:
break;
// 3c 00 3f 00 == illegal UTF-16 little endian
case 0x00:
return;
}
// else probably UCS-4
break;
// 3c 3f 78 6d == ASCII and supersets '<?xm'
case '?':
break;
//
// One of several encodings could be used:
// Shift-JIS, ASCII, UTF-8, ISO-8859-*, etc
//
return;
}
break;
// 4c 6f a7 94 ... some EBCDIC code page
case 0x4c:
return;
}
// whoops, treat as UTF-8
break;
// UTF-16 big-endian
case 0xfe:
break;
return;
// UTF-16 little-endian
case 0xff:
break;
return;
// default ... no XML declaration
default:
break;
}
//
// If all else fails, assume XML without a declaration, and
// using UTF-8 encoding.
//
}
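// Illustrative sketch of the four-byte autodetection heuristic used above
// (XML 1.0, Appendix F).  The method name and the historical encoding
// labels returned ("UnicodeBig"/"UnicodeLittle") are choices made for this
// example, not necessarily what the surrounding code assigns.
private static String sniffEncoding(byte[] buf, int len) {
    if (len < 4)
        return "UTF-8";                           // too short to tell; use the default
    int b0 = buf[0] & 0xff, b1 = buf[1] & 0xff;
    int b2 = buf[2] & 0xff, b3 = buf[3] & 0xff;
    if (b0 == 0xfe && b1 == 0xff)
        return "UnicodeBig";                      // UTF-16, big-endian byte order mark
    if (b0 == 0xff && b1 == 0xfe)
        return "UnicodeLittle";                   // UTF-16, little-endian byte order mark
    if (b0 == 0x00 && b1 == 0x3c && b2 == 0x00 && b3 == 0x3f)
        return "UnicodeBig";                      // "<?" in UTF-16BE, missing its BOM
    if (b0 == 0x3c && b1 == 0x00 && b2 == 0x3f && b3 == 0x00)
        return "UnicodeLittle";                   // "<?" in UTF-16LE, missing its BOM
    if (b0 == 0x4c && b1 == 0x6f && b2 == 0xa7 && b3 == 0x94)
        return "CP037";                           // "<?xm" in an EBCDIC code page
    return "UTF-8";                               // ASCII superset, or no declaration
}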
/*
* Read the encoding decl on the stream, knowing that it should
* be readable using the specified encoding (basically, ASCII or
* EBCDIC). The body of the document may use a wider range of
* characters than the declaration itself, so we switch to
* the specified encoding as soon as we can. (ASCII is a subset
* of UTF-8, ISO-8859-*, ISO-2022-JP, EUC-JP, and more; EBCDIC
* has a variety of "code pages" that have these characters as
* a common subset.)
*/
throws IOException {
byte buffer [] = new byte[MAXPUSHBACK];
int len;
Reader r;
int c;
//
// Buffer up a bunch of input, and set up to read it in
// the specified encoding ... we can skip the first four
// bytes since we know that "<?xm" was read to determine
// what encoding to use!
//
encoding);
//
// Next must be "l" (and whitespace) else we conclude
// error and choose UTF-8.
//
if ((c = r.read()) != 'l') {
return;
}
//
// Then, we'll skip any
// S version="..." [or single quotes]
// bit and get any subsequent
// S encoding="..." [or single quotes]
//
// We put an arbitrary size limit on how far we read; lots
// of space will break this algorithm.
//
boolean sawEq = false;
char quoteChar = 0;
boolean sawQuestion = false;
if ((c = r.read()) == -1)
break;
if (c == ' ' || c == '\t' || c == '\n' || c == '\r')
continue;
// ... but require at least a little!
if (i == 0)
break;
// terminate the loop ASAP
if (c == '?')
sawQuestion = true;
else if (sawQuestion) {
if (c == '>')
break;
sawQuestion = false;
}
// did we get the "key =" bit yet?
if (Character.isWhitespace((char) c))
continue;
sawEq = false;
} else if (Character.isWhitespace((char) c)) {
} else if (c == '=') {
sawEq = true;
quoteChar = 0;
} else
continue;
}
// space before quoted value
if (Character.isWhitespace((char) c))
continue;
if (c == '"' || c == '\'') {
if (quoteChar == 0) {
quoteChar = (char) c;
continue;
} else if (c == quoteChar) {
// [81] Encname ::= [A-Za-z] ([A-Za-z0-9._]|'-')*
c = assignedEncoding.charAt(i);
if ((c >= 'A' && c <= 'Z')
|| (c >= 'a' && c <= 'z'))
continue;
if (i == 0)
break XmlDecl;
if (i > 0 && (c == '-'
|| (c >= '0' && c <= '9')
|| c == '.' || c == '_'))
continue;
// map illegal names to UTF-8 default
break XmlDecl;
}
return;
} else {
continue;
}
}
}
}
}
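// Illustrative alternative to the hand-rolled scan above: on platforms
// where java.util.regex is available, the encoding pseudo-attribute can be
// pulled out of the already-buffered declaration prefix like this.  The
// method name and the UTF-8 fallback are assumptions for this sketch.
private static String encodingFromDecl(String declPrefix) {
    java.util.regex.Matcher m = java.util.regex.Pattern
            .compile("encoding\\s*=\\s*[\"']([A-Za-z][A-Za-z0-9._-]*)[\"']")
            .matcher(declPrefix);
    return m.find() ? m.group(1) : "UTF-8";       // no declaration => UTF-8 default
}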
throws IOException {
}
/**
* Reads characters into a portion of the buffer, returning the number
* of characters read, or -1 on EOF.
*/
int val;
if (closed)
return -1; // throw new IOException ("closed");
if (val == -1)
close();
return val;
}
/**
* Reads a single character.
*/
int val;
if (closed)
throw new IOException("closed");
if (val == -1)
close();
return val;
}
/**
* Returns true iff the reader supports mark/reset.
*/
public boolean markSupported() {
}
/**
* Sets a mark allowing a limited number of characters to
* be "peeked", by reading and then resetting.
*
* @param value how many characters may be "peeked".
*/
}
/**
* Resets the current position to the last marked position.
*/
}
/**
* Skips a specified number of characters.
*/
}
/**
* Returns true iff input characters are known to be ready.
*/
}
/**
* Closes the reader.
*/
if (closed)
return;
closed = true;
}
//
// Delegating to a converter module will always be slower than
// direct conversion. Use a similar approach for any other
// readers that need to be particularly fast; only block I/O
// speed matters to this package. For UTF-16, separate readers
// for big and little endian streams make a difference, too;
// fewer conditionals in the critical path!
//
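// Illustrative sketch of the dedicated big-endian UTF-16 loop the comment
// above has in mind (the method name is an assumption; surrogate handling
// and buffering are omitted for brevity).
private static int decodeUtf16BE(byte[] bytes, int byteLen, char[] out) {
    int chars = 0;
    for (int i = 0; i + 1 < byteLen; i += 2)
        out[chars++] = (char) (((bytes[i] & 0xff) << 8) | (bytes[i + 1] & 0xff));
    return chars;
}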
protected byte buffer [];
super(stream);
buffer = new byte[8192];
}
}
// caller shouldn't read again
}
}
}
//
// We want this reader to make the default encoding be as fast
// as we can make it. JDK's "UTF8" (not "UTF-8" till JDK 1.2)
// InputStreamReader works, but 20+% slower speed isn't OK for
// the default encoding.
//
// 2nd half of UTF-8 surrogate pair
private char nextChar;
super(stream);
}
int i = 0, c = 0;
if (len <= 0)
return 0;
// Consume remaining half of any surrogate pair immediately
if (nextChar != 0) {
nextChar = 0;
}
while (i < len) {
// stop or read data if needed
c = -1;
break;
}
start = 0;
if (finish <= 0) {
this.close();
c = -1;
break;
}
}
//
// RFC 2279 describes UTF-8; there are six encoding forms.
// Each encoded character takes a fixed number of bytes
// (1 to 6) and is flagged by a bit pattern in the
// first byte. The five and six byte-per-character
// encodings address characters which are disallowed
// in XML documents, as do some four byte ones.
//
//
// Single byte == ASCII. Common; optimize.
//
if ((c & 0x80) == 0x00) {
// 0x0000 <= c <= 0x007f
start++;
continue;
}
//
// Multibyte chars -- check offsets optimistically,
// ditto the "10xx xxxx" format for subsequent bytes
//
try {
// 2 bytes
// 0x0080 <= c <= 0x07ff
// 3 bytes
// 0x0800 <= c <= 0xffff
// 4 bytes
// 0x0001 0000 <= c <= 0x001f ffff
// Unicode supports c <= 0x0010 ffff ...
if (c > 0x0010ffff)
throw new CharConversionException("UTF-8 encoding of character 0x00"
+ Integer.toHexString(c)
+ " can't be converted to Unicode.");
// Convert UCS-4 char to surrogate pair (UTF-16)
c -= 0x10000;
c = 0xD800 + (c >> 10);
// 5 and 6 byte versions are XML WF errors, but
// typically come from mislabeled encodings
} else
throw new CharConversionException("Unconvertible UTF-8 character"
+ " beginning with 0x"
} catch (ArrayIndexOutOfBoundsException e) {
// off > length && length >= buffer.length
c = 0;
}
//
// if the buffer held only a partial character,
// compact it and try to read the rest of the
// character. worst case involves three
// single-byte reads -- quite rare.
//
start = 0;
if (off < 0) {
this.close();
throw new CharConversionException("Partial UTF-8 char");
}
continue;
}
//
// check the format of the non-initial bytes
//
this.close();
throw new CharConversionException("Malformed UTF-8 char -- "
+ "is an XML encoding declaration missing?");
}
}
//
// If this needed a surrogate pair, consume ASAP
//
nextChar = 0;
}
}
if (i > 0)
return i;
return (c == -1) ? -1 : 0;
}
}
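// Illustrative sketch of the multi-byte decoding described by the RFC 2279
// comment above, covering the one- to four-byte forms plus the surrogate
// arithmetic used for code points above U+FFFF.  The method name and error
// handling are assumptions; continuation bytes are not validated here.
private static int decodeUtf8(byte[] b, int off) throws CharConversionException {
    int c = b[off] & 0xff;
    if ((c & 0x80) == 0x00)                       // 1 byte: 0xxxxxxx
        return c;
    if ((c & 0xe0) == 0xc0)                       // 2 bytes: 110xxxxx 10xxxxxx
        return ((c & 0x1f) << 6) | (b[off + 1] & 0x3f);
    if ((c & 0xf0) == 0xe0)                       // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
        return ((c & 0x0f) << 12)
                | ((b[off + 1] & 0x3f) << 6)
                | (b[off + 2] & 0x3f);
    if ((c & 0xf8) == 0xf0) {                     // 4 bytes: code points above U+FFFF
        c = ((c & 0x07) << 18)
                | ((b[off + 1] & 0x3f) << 12)
                | ((b[off + 2] & 0x3f) << 6)
                | (b[off + 3] & 0x3f);
        // Emitted as a UTF-16 surrogate pair:
        //   high = 0xD800 + ((c - 0x10000) >> 10)
        //   low  = 0xDC00 + ((c - 0x10000) & 0x3FF)
        return c;
    }
    throw new CharConversionException("Malformed UTF-8 sequence");
}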
//
// We want ASCII and ISO-8859 Readers since they're the most common
// encodings in the US and Europe, and we don't want performance
// regressions for them. They're also easy to implement efficiently,
// since they're bitmask subsets of UNICODE.
//
// XXX haven't benchmarked these readers vs what we get out of JDK.
//
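// Illustrative sketch of the "bitmask subset" point above: an ISO-8859-1
// byte maps directly to the Unicode code point with the same value, and
// ASCII is the further subset with the high bit clear.  Helper names are
// assumptions for this example.
private static char latin1ToChar(byte b) {
    return (char) (b & 0xff);                     // ISO-8859-1 == first 256 code points
}
private static boolean isAsciiByte(byte b) {
    return (b & 0x80) == 0;                       // ASCII == ISO-8859-1, top bit clear
}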
super(in);
}
int i, c;
return -1;
for (i = 0; i < len; i++) {
start = 0;
if (finish <= 0) {
if (finish <= 0)
this.close();
break;
}
}
if ((c & 0x80) != 0)
throw new CharConversionException("Illegal ASCII character, 0x"
}
return -1;
return i;
}
}
super(in);
}
int i;
return -1;
for (i = 0; i < len; i++) {
start = 0;
if (finish <= 0) {
if (finish <= 0)
this.close();
break;
}
}
}
return -1;
return i;
}
}
}