/* * Copyright (c) 1998, 2011, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. Oracle designates this * particular file as subject to the "Classpath" exception as provided * by Oracle in the LICENSE file that accompanied this code. * * This code is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * version 2 for more details (a copy is included in the LICENSE file that * accompanied this code). * * You should have received a copy of the GNU General Public License version * 2 along with this work; if not, write to the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. * * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA * or visit www.oracle.com if you need additional information or have any * questions. */ package com.sun.xml.internal.dtdparser; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.SAXParseException; import java.io.CharConversionException; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; import java.io.UnsupportedEncodingException; import java.net.URL; import java.util.Locale; /** * This is how the parser talks to its input entities, of all kinds. * The entities are in a stack. *

For internal entities, the character arrays are referenced here, * and read from as needed (they're read-only). External entities have * mutable buffers, that are read into as needed. *

Note: This maps CRLF (and CR) to LF without regard for * whether it's in an external (parsed) entity or not. The XML 1.0 spec * is inconsistent in explaining EOL handling; this is the sensible way. * * @author David Brownell * @author Janet Koenig * @version 1.4 00/08/05 */ public class InputEntity { private int start, finish; private char buf []; private int lineNumber = 1; private boolean returnedFirstHalf = false; private boolean maybeInCRLF = false; // name of entity (never main document or unnamed DTD PE) private String name; private InputEntity next; // for system and public IDs in diagnostics private InputSource input; // this is a buffer; some buffers can be replenished. private Reader reader; private boolean isClosed; private DTDEventListener errHandler; private Locale locale; private StringBuffer rememberedText; private int startRemember; // record if this is a PE, so endParsedEntity won't be called private boolean isPE; // InputStreamReader throws an internal per-read exception, so // we minimize reads. We also add a byte to compensate for the // "ungetc" byte we keep, so that our downstream reads are as // nicely sized as we can make them. final private static int BUFSIZ = 8 * 1024 + 1; final private static char newline [] = {'\n'}; public static InputEntity getInputEntity(DTDEventListener h, Locale l) { InputEntity retval = new InputEntity(); retval.errHandler = h; retval.locale = l; return retval; } private InputEntity() { } // // predicate: return true iff this is an internal entity reader, // and so may safely be "popped" as needed. external entities have // syntax to uphold; internal parameter entities have at most validity // constraints to monitor. also, only external entities get decent // location diagnostics. // public boolean isInternal() { return reader == null; } // // predicate: return true iff this is the toplevel document // public boolean isDocument() { return next == null; } // // predicate: return true iff this is a PE expansion (so that // LexicalEventListner.endParsedEntity won't be called) // public boolean isParameterEntity() { return isPE; } // // return name of current entity // public String getName() { return name; } // // use this for an external parsed entity // public void init(InputSource in, String name, InputEntity stack, boolean isPE) throws IOException, SAXException { input = in; this.isPE = isPE; reader = in.getCharacterStream(); if (reader == null) { InputStream bytes = in.getByteStream(); if (bytes == null) reader = XmlReader.createReader(new URL(in.getSystemId()) .openStream()); else if (in.getEncoding() != null) reader = XmlReader.createReader(in.getByteStream(), in.getEncoding()); else reader = XmlReader.createReader(in.getByteStream()); } next = stack; buf = new char[BUFSIZ]; this.name = name; checkRecursion(stack); } // // use this for an internal parsed entity; buffer is readonly // public void init(char b [], String name, InputEntity stack, boolean isPE) throws SAXException { next = stack; buf = b; finish = b.length; this.name = name; this.isPE = isPE; checkRecursion(stack); } private void checkRecursion(InputEntity stack) throws SAXException { if (stack == null) return; for (stack = stack.next; stack != null; stack = stack.next) { if (stack.name != null && stack.name.equals(name)) fatal("P-069", new Object[]{name}); } } public InputEntity pop() throws IOException { // caller has ensured there's nothing left to read close(); return next; } /** * returns true iff there's no more data to consume ... */ public boolean isEOF() throws IOException, SAXException { // called to ensure WF-ness of included entities and to pop // input entities appropriately ... EOF is not always legal. if (start >= finish) { fillbuf(); return start >= finish; } else return false; } /** * Returns the name of the encoding in use, else null; the name * returned is in as standard a form as we can get. */ public String getEncoding() { if (reader == null) return null; if (reader instanceof XmlReader) return ((XmlReader) reader).getEncoding(); // XXX prefer a java2std() call to normalize names... if (reader instanceof InputStreamReader) return ((InputStreamReader) reader).getEncoding(); return null; } /** * returns the next name char, or NUL ... faster than getc(), * and the common "name or nmtoken must be next" case won't * need ungetc(). */ public char getNameChar() throws IOException, SAXException { if (finish <= start) fillbuf(); if (finish > start) { char c = buf[start++]; if (XmlChars.isNameChar(c)) return c; start--; } return 0; } /** * gets the next Java character -- might be part of an XML * text character represented by a surrogate pair, or be * the end of the entity. */ public char getc() throws IOException, SAXException { if (finish <= start) fillbuf(); if (finish > start) { char c = buf[start++]; // [2] Char ::= #x0009 | #x000A | #x000D // | [#x0020-#xD7FF] // | [#xE000-#xFFFD] // plus surrogate _pairs_ representing [#x10000-#x10ffff] if (returnedFirstHalf) { if (c >= 0xdc00 && c <= 0xdfff) { returnedFirstHalf = false; return c; } else fatal("P-070", new Object[]{Integer.toHexString(c)}); } if ((c >= 0x0020 && c <= 0xD7FF) || c == 0x0009 // no surrogates! || (c >= 0xE000 && c <= 0xFFFD)) return c; // // CRLF and CR are both line ends; map both to LF, and // keep line count correct. // else if (c == '\r' && !isInternal()) { maybeInCRLF = true; c = getc(); if (c != '\n') ungetc(); maybeInCRLF = false; lineNumber++; return '\n'; } else if (c == '\n' || c == '\r') { // LF, or 2nd char in CRLF if (!isInternal() && !maybeInCRLF) lineNumber++; return c; } // surrogates... if (c >= 0xd800 && c < 0xdc00) { returnedFirstHalf = true; return c; } fatal("P-071", new Object[]{Integer.toHexString(c)}); } throw new EndOfInputException(); } /** * lookahead one character */ public boolean peekc(char c) throws IOException, SAXException { if (finish <= start) fillbuf(); if (finish > start) { if (buf[start] == c) { start++; return true; } else return false; } return false; } /** * two character pushback is guaranteed */ public void ungetc() { if (start == 0) throw new InternalError("ungetc"); start--; if (buf[start] == '\n' || buf[start] == '\r') { if (!isInternal()) lineNumber--; } else if (returnedFirstHalf) returnedFirstHalf = false; } /** * optional grammatical whitespace (discarded) */ public boolean maybeWhitespace() throws IOException, SAXException { char c; boolean isSpace = false; boolean sawCR = false; // [3] S ::= #20 | #09 | #0D | #0A for (; ;) { if (finish <= start) fillbuf(); if (finish <= start) return isSpace; c = buf[start++]; if (c == 0x20 || c == 0x09 || c == '\n' || c == '\r') { isSpace = true; // // CR, LF are line endings ... CLRF is one, not two! // if ((c == '\n' || c == '\r') && !isInternal()) { if (!(c == '\n' && sawCR)) { lineNumber++; sawCR = false; } if (c == '\r') sawCR = true; } } else { start--; return isSpace; } } } /** * normal content; whitespace in markup may be handled * specially if the parser uses the content model. *

content terminates with markup delimiter characters, * namely ampersand (&) and left angle bracket (<). *

the document handler's characters() method is called * on all the content found */ public boolean parsedContent(DTDEventListener docHandler /*ElementValidator validator*/) throws IOException, SAXException { // [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*) int first; // first char to return int last; // last char to return boolean sawContent; // sent any chars? char c; // deliver right out of the buffer, until delimiter, EOF, // or error, refilling as we go for (first = last = start, sawContent = false; ; last++) { // buffer empty? if (last >= finish) { if (last > first) { // validator.text (); docHandler.characters(buf, first, last - first); sawContent = true; start = last; } if (isEOF()) // calls fillbuf return sawContent; first = start; last = first - 1; // incremented in loop continue; } c = buf[last]; // // pass most chars through ASAP; this inlines the code of // [2] !XmlChars.isChar(c) leaving only characters needing // special treatment ... line ends, surrogates, and: // 0x0026 == '&' // 0x003C == '<' // 0x005D == ']' // Comparisons ordered for speed on 'typical' text // if ((c > 0x005D && c <= 0xD7FF) // a-z and more || (c < 0x0026 && c >= 0x0020) // space & punct || (c > 0x003C && c < 0x005D) // A-Z & punct || (c > 0x0026 && c < 0x003C) // 0-9 & punct || c == 0x0009 || (c >= 0xE000 && c <= 0xFFFD) ) continue; // terminate on markup delimiters if (c == '<' || c == '&') break; // count lines if (c == '\n') { if (!isInternal()) lineNumber++; continue; } // External entities get CR, CRLF --> LF mapping // Internal ones got it already, and we can't repeat // else we break char ref handling!! if (c == '\r') { if (isInternal()) continue; docHandler.characters(buf, first, last - first); docHandler.characters(newline, 0, 1); sawContent = true; lineNumber++; if (finish > (last + 1)) { if (buf[last + 1] == '\n') last++; } else { // CR at end of buffer // XXX case not yet handled: CRLF here will look like two lines } first = start = last + 1; continue; } // ']]>' is a WF error -- must fail if we see it if (c == ']') { switch (finish - last) { // for suspicious end-of-buffer cases, get more data // into the buffer to rule out this sequence. case 2: if (buf[last + 1] != ']') continue; // FALLTHROUGH case 1: if (reader == null || isClosed) continue; if (last == first) throw new InternalError("fillbuf"); last--; if (last > first) { // validator.text (); docHandler.characters(buf, first, last - first); sawContent = true; start = last; } fillbuf(); first = last = start; continue; // otherwise any "]]>" would be buffered, and we can // see right away if that's what we have default: if (buf[last + 1] == ']' && buf[last + 2] == '>') fatal("P-072", null); continue; } } // correctly paired surrogates are OK if (c >= 0xd800 && c <= 0xdfff) { if ((last + 1) >= finish) { if (last > first) { // validator.text (); docHandler.characters(buf, first, last - first); sawContent = true; start = last + 1; } if (isEOF()) { // calls fillbuf fatal("P-081", new Object[]{Integer.toHexString(c)}); } first = start; last = first; continue; } if (checkSurrogatePair(last)) last++; else { last--; // also terminate on surrogate pair oddities break; } continue; } fatal("P-071", new Object[]{Integer.toHexString(c)}); } if (last == first) return sawContent; // validator.text (); docHandler.characters(buf, first, last - first); start = last; return true; } /** * CDATA -- character data, terminated by "]]>" and optionally * including unescaped markup delimiters (ampersand and left angle * bracket). This should otherwise be exactly like character data, * modulo differences in error report details. *

The document handler's characters() or ignorableWhitespace() * methods are invoked on all the character data found * * @param docHandler gets callbacks for character data * @param ignorableWhitespace if true, whitespace characters will * be reported using docHandler.ignorableWhitespace(); implicitly, * non-whitespace characters will cause validation errors * @param whitespaceInvalidMessage if true, ignorable whitespace * causes a validity error report as well as a callback */ public boolean unparsedContent(DTDEventListener docHandler, /*ElementValidator validator,*/ boolean ignorableWhitespace, String whitespaceInvalidMessage) throws IOException, SAXException { // [18] CDSect ::= CDStart CData CDEnd // [19] CDStart ::= '' Char*)) // [21] CDEnd ::= ']]>' // caller peeked the leading '<' ... if (!peek("![CDATA[", null)) return false; docHandler.startCDATA(); // only a literal ']]>' stops this ... int last; for (; ;) { // until ']]>' seen boolean done = false; char c; // don't report ignorable whitespace as "text" for // validation purposes. boolean white = ignorableWhitespace; for (last = start; last < finish; last++) { c = buf[last]; // // Reject illegal characters. // if (!XmlChars.isChar(c)) { white = false; if (c >= 0xd800 && c <= 0xdfff) { if (checkSurrogatePair(last)) { last++; continue; } else { last--; break; } } fatal("P-071", new Object[] {Integer.toHexString(buf[last])}); } if (c == '\n') { if (!isInternal()) lineNumber++; continue; } if (c == '\r') { // As above, we can't repeat CR/CRLF --> LF mapping if (isInternal()) continue; if (white) { if (whitespaceInvalidMessage != null) errHandler.error(new SAXParseException(DTDParser.messages.getMessage(locale, whitespaceInvalidMessage), null)); docHandler.ignorableWhitespace(buf, start, last - start); docHandler.ignorableWhitespace(newline, 0, 1); } else { // validator.text (); docHandler.characters(buf, start, last - start); docHandler.characters(newline, 0, 1); } lineNumber++; if (finish > (last + 1)) { if (buf[last + 1] == '\n') last++; } else { // CR at end of buffer // XXX case not yet handled ... as above } start = last + 1; continue; } if (c != ']') { if (c != ' ' && c != '\t') white = false; continue; } if ((last + 2) < finish) { if (buf[last + 1] == ']' && buf[last + 2] == '>') { done = true; break; } white = false; continue; } else { //last--; break; } } if (white) { if (whitespaceInvalidMessage != null) errHandler.error(new SAXParseException(DTDParser.messages.getMessage(locale, whitespaceInvalidMessage), null)); docHandler.ignorableWhitespace(buf, start, last - start); } else { // validator.text (); docHandler.characters(buf, start, last - start); } if (done) { start = last + 3; break; } start = last; if (isEOF()) fatal("P-073", null); } docHandler.endCDATA(); return true; } // return false to backstep at end of buffer) private boolean checkSurrogatePair(int offset) throws SAXException { if ((offset + 1) >= finish) return false; char c1 = buf[offset++]; char c2 = buf[offset]; if ((c1 >= 0xd800 && c1 < 0xdc00) && (c2 >= 0xdc00 && c2 <= 0xdfff)) return true; fatal("P-074", new Object[]{ Integer.toHexString(c1 & 0x0ffff), Integer.toHexString(c2 & 0x0ffff) }); return false; } /** * whitespace in markup (flagged to app, discardable) *

the document handler's ignorableWhitespace() method * is called on all the whitespace found */ public boolean ignorableWhitespace(DTDEventListener handler) throws IOException, SAXException { char c; boolean isSpace = false; int first; // [3] S ::= #20 | #09 | #0D | #0A for (first = start; ;) { if (finish <= start) { if (isSpace) handler.ignorableWhitespace(buf, first, start - first); fillbuf(); first = start; } if (finish <= start) return isSpace; c = buf[start++]; switch (c) { case '\n': if (!isInternal()) lineNumber++; // XXX handles Macintosh line endings wrong // fallthrough case 0x09: case 0x20: isSpace = true; continue; case '\r': isSpace = true; if (!isInternal()) lineNumber++; handler.ignorableWhitespace(buf, first, (start - 1) - first); handler.ignorableWhitespace(newline, 0, 1); if (start < finish && buf[start] == '\n') ++start; first = start; continue; default: ungetc(); if (isSpace) handler.ignorableWhitespace(buf, first, start - first); return isSpace; } } } /** * returns false iff 'next' string isn't as provided, * else skips that text and returns true. *

NOTE: two alternative string representations are * both passed in, since one is faster. */ public boolean peek(String next, char chars []) throws IOException, SAXException { int len; int i; if (chars != null) len = chars.length; else len = next.length(); // buffer should hold the whole thing ... give it a // chance for the end-of-buffer case and cope with EOF // by letting fillbuf compact and fill if (finish <= start || (finish - start) < len) fillbuf(); // can't peek past EOF if (finish <= start) return false; // compare the string; consume iff it matches if (chars != null) { for (i = 0; i < len && (start + i) < finish; i++) { if (buf[start + i] != chars[i]) return false; } } else { for (i = 0; i < len && (start + i) < finish; i++) { if (buf[start + i] != next.charAt(i)) return false; } } // if the first fillbuf didn't get enough data, give // fillbuf another chance to read if (i < len) { if (reader == null || isClosed) return false; // // This diagnostic "knows" that the only way big strings would // fail to be peeked is where it's a symbol ... e.g. for an // construct. That knowledge could also be applied // to get rid of the symbol length constraint, since having // the wrong symbol is a fatal error anyway ... // if (len > buf.length) fatal("P-077", new Object[]{new Integer(buf.length)}); fillbuf(); return peek(next, chars); } start += len; return true; } // // Support for reporting the internal DTD subset, so // declarations can be recreated. This is collected as a single // string; such subsets are normally small, and many applications // don't even care about this. // public void startRemembering() { if (startRemember != 0) throw new InternalError(); startRemember = start; } public String rememberText() { String retval; // If the internal subset crossed a buffer boundary, we // created a temporary buffer. if (rememberedText != null) { rememberedText.append(buf, startRemember, start - startRemember); retval = rememberedText.toString(); } else retval = new String(buf, startRemember, start - startRemember); startRemember = 0; rememberedText = null; return retval; } private InputEntity getTopEntity() { InputEntity current = this; // don't report locations within internal entities! while (current != null && current.input == null) current = current.next; return current == null ? this : current; } /** * Returns the public ID of this input source, if known */ public String getPublicId() { InputEntity where = getTopEntity(); if (where == this) return input.getPublicId(); return where.getPublicId(); } /** * Returns the system ID of this input source, if known */ public String getSystemId() { InputEntity where = getTopEntity(); if (where == this) return input.getSystemId(); return where.getSystemId(); } /** * Returns the current line number in this input source */ public int getLineNumber() { InputEntity where = getTopEntity(); if (where == this) return lineNumber; return where.getLineNumber(); } /** * returns -1; maintaining column numbers hurts performance */ public int getColumnNumber() { return -1; // not maintained (speed) } // // n.b. for non-EOF end-of-buffer cases, reader should return // at least a handful of bytes so various lookaheads behave. // // two character pushback exists except at first; characters // represented by surrogate pairs can't be pushed back (they'd // only be in character data anyway). // // DTD exception thrown on char conversion problems; line number // will be low, as a rule. // private void fillbuf() throws IOException, SAXException { // don't touched fixed buffers, that'll usually // change entity values (and isn't needed anyway) // likewise, ignore closed streams if (reader == null || isClosed) return; // if remembering DTD text, copy! if (startRemember != 0) { if (rememberedText == null) rememberedText = new StringBuffer(buf.length); rememberedText.append(buf, startRemember, start - startRemember); } boolean extra = (finish > 0) && (start > 0); int len; if (extra) // extra pushback start--; len = finish - start; System.arraycopy(buf, start, buf, 0, len); start = 0; finish = len; try { len = buf.length - len; len = reader.read(buf, finish, len); } catch (UnsupportedEncodingException e) { fatal("P-075", new Object[]{e.getMessage()}); } catch (CharConversionException e) { fatal("P-076", new Object[]{e.getMessage()}); } if (len >= 0) finish += len; else close(); if (extra) // extra pushback start++; if (startRemember != 0) // assert extra == true startRemember = 1; } public void close() { try { if (reader != null && !isClosed) reader.close(); isClosed = true; } catch (IOException e) { /* NOTHING */ } } private void fatal(String messageId, Object params []) throws SAXException { SAXParseException x = new SAXParseException(DTDParser.messages.getMessage(locale, messageId, params), null); // not continuable ... e.g. WF errors close(); errHandler.fatalError(x); throw x; } }