/*
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/**
* This implements parsing of XML 1.0 DTDs.
* <p/>
* This conforms to the portion of the XML 1.0 specification related
* to the external DTD subset.
* <p/>
* For multi-language applications (such as web servers using XML
* processing to create dynamic content), a method supports choosing
* a locale for parser diagnostics which is both understood by the
* message recipient and supported by the parser.
* <p/>
* This parser produces a stream of parse events. It supports some
* features (exposing comments, CDATA sections, and entity references)
* which are not required to be reported by conformant XML processors.
*
* @author David Brownell
* @author Janet Koenig
* @author Kohsuke KAWAGUCHI
* @version $Id: DTDParser.java,v 1.2 2009-04-16 15:25:49 snajper Exp $
*/
public class DTDParser {
// stack of input entities being merged
// temporaries reused during parsing
// Scratch character buffer: init() allocates it with 20 slots and the
// name-reading code fills it by index (nameTmp[0] = c; nameTmp[i++] = c).
private char nameTmp [];
// temporary DTD parsing state
// Switch consulted by maybeWhitespace(): while false, whitespace skipping is
// delegated to in.maybeWhitespace(); while true, characters are read through
// getc() instead. init() resets it to false.
private boolean doLexicalPE;
// DTD state, used during parsing
// private SimpleHashtable elements = new SimpleHashtable (47);
// exposed to package-private subclass
// listeners for DTD parsing events
// string constants -- use these copies so "==" works
// package private
/**
* Used by applications to request locale for diagnostics.
*
* @param l The locale to use, or null to use system defaults
* (which may include only message IDs).
*/
"P-078", new Object[]{l}));
}
locale = l;
}
/**
* Returns the diagnostic locale.
*/
return locale;
}
/**
* Chooses a client locale to use for diagnostics, using the first
* language specified in the list that is supported by this parser.
* That locale is then set using <a href="#setLocale(java.util.Locale)">
* setLocale()</a>. Such a list could be provided by a variety of user
* preference mechanisms, including the HTTP <em>Accept-Language</em>
* header field.
*
* @param languages Array of language specifiers, ordered with the most
* preferable one at the front. For example, "en-ca" then "fr-ca",
* followed by "zh_CN". Both RFC 1766 and Java styles are supported.
* @return The chosen locale, or null.
* @see MessageCatalog
*/
throws SAXException {
if (l != null) {
setLocale(l);
}
return l;
}
/**
* Lets applications control entity resolution.
*/
resolver = r;
}
/**
* Returns the object used to resolve entities
*/
return resolver;
}
/**
* Used by applications to set handling of DTD parsing events.
*/
public String getPublicId() {
return DTDParser.this.getPublicId();
}
public String getSystemId() {
return DTDParser.this.getSystemId();
}
public int getLineNumber() {
return DTDParser.this.getLineNumber();
}
public int getColumnNumber() {
return DTDParser.this.getColumnNumber();
}
});
}
/**
* Returns the handler used for DTD parsing events.
*/
return dtdHandler;
}
/**
* Parse a DTD.
*/
throws IOException, SAXException {
init();
}
/**
* Parse a DTD.
*/
throws IOException, SAXException {
init();
// System.out.println ("parse (\"" + uri + "\")");
// If custom resolver punts resolution to parser, handle it ...
// ... or if custom resolver doesn't correctly construct the
// input entity, patch it up enough so relative URIs work, and
// issue a warning to minimize later confusion.
}
}
// makes sure the parser is reset to "before a document"
private void init() {
    // Guarantee there is always an event handler to call: install the
    // base handler only when none has been set.
    if (dtdHandler == null) {
        dtdHandler = new DTDHandlerBase();
    }

    // Lexical parameter-entity processing starts out disabled.
    doLexicalPE = false;

    // Fresh scratch storage used while parsing.
    nameTmp = new char[20];
    strTmp = new StringBuffer();
}
}
////////////////////////////////////////////////////////////////
//
// parsing is by recursive descent, code roughly
// following the BNF rules except tweaked for simple
// lookahead. rules are more or less in numeric order,
// except where code sharing suggests other structures.
//
// a classic benefit of recursive descent parsers: it's
// relatively easy to get diagnostics that make sense.
//
////////////////////////////////////////////////////////////////
throws IOException, SAXException {
fatal("P-000");
try {
// [30] extSubset ::= TextDecl? extSubsetDecl
// [31] extSubsetDecl ::= ( markupdecl | conditionalSect
// | PEReference | S )*
// ... same as [79] extPE, which is where the code is
}
afterRoot();
dtdHandler.endDTD();
} catch (EndOfInputException e) {
if (!in.isDocument()) {
do { // force a relevant URI and line number
} while (in.isInternal());
} else {
}
} catch (RuntimeException e) {
// Don't discard location that triggered the exception
// ## Should properly wrap exception
e.printStackTrace();
getPublicId(), getSystemId(),
getLineNumber(), getColumnNumber());
} finally {
// recycle temporary data used during parsing
// ditto input sources etc
}
// get rid of all DTD info ... some of it would be
// useful for editors etc, investigate later.
// elements.clear();
}
}
// Make sure all IDREFs match declared ID attributes. We scan
// after the document element is parsed, since XML allows forward
// references, and only now can we know if they're all resolved.
e.hasMoreElements();
) {
}
}
// role is for diagnostics
throws IOException, SAXException {
// [3] S ::= (#x20 | #x9 | #xd | #xa)+
if (!maybeWhitespace()) {
}
}
// S?
// Consumes optional whitespace (space, tab, LF, CR) and reports whether
// any was seen. With doLexicalPE off the work is handed to
// in.maybeWhitespace(); otherwise characters are pulled through getc()
// and the first non-whitespace character is pushed back with ungetc().
private boolean maybeWhitespace()
throws IOException, SAXException {
if (!doLexicalPE)
return in.maybeWhitespace();
// see getc() for the PE logic -- this lets us splice
// expansions of PEs in "anywhere". getc() has smarts,
// so for external PEs we don't bypass it.
// XXX we can marginally speed PE handling, and certainly
// be cleaner (hence potentially more correct), by using
// the observations that expanded PEs only start and stop
// where whitespace is allowed. getc wouldn't need any
// "lexical" PE expansion logic, and no other method needs
// to handle termination of PEs. (parsing of literals would
// still need to pop entities, but not parsing of references
// in content.)
char c = getc();
// 'saw' records whether at least one whitespace character was read.
boolean saw = false;
while (c == ' ' || c == '\t' || c == '\n' || c == '\r') {
saw = true;
// this gracefully ends things when we stop playing
// with internal parameters. caller should have a
// grammar rule allowing whitespace at end of entity.
// NOTE(review): as written this return is unconditional, so the
// getc() on the next line can never run and the loop ends after the
// first whitespace character. The comment above suggests a guard
// condition (end of a non-internal entity?) belongs here -- confirm
// against the upstream source before changing.
return saw;
c = getc();
}
// The character that ended the loop is not whitespace; give it back.
ungetc();
return saw;
}
throws IOException, SAXException {
}
throws IOException, SAXException {
// [5] Name ::= (Letter|'_'|':') (Namechar)*
char c = getc();
ungetc();
return null;
}
return nameCharString(c);
}
// Used when parsing enumerations
throws IOException, SAXException {
// [7] Nmtoken ::= (Namechar)+
char c = getc();
if (!XmlChars.isNameChar(c))
return nameCharString(c).name;
}
// n.b. this gets used when parsing attribute values (for
// internal references) so we can't use strTmp; it's also
// a hotspot for CPU and memory in the parser (called at least
// once for each element) so this has been optimized a bit.
throws IOException, SAXException {
int i = 1;
nameTmp[0] = c;
for (; ;) {
break;
}
nameTmp[i++] = c;
}
}
//
// much similarity between parsing entity values in DTD
// and attribute values (in DTD or content) ... both follow
// literal parsing rules, newline canonicalization, etc
//
// leaves value in 'strTmp' ... either a "replacement text" (4.5),
// or else partially normalized attribute value (the first bit
// of 3.3.3's spec, without the "if not CDATA" bits).
//
throws IOException, SAXException {
// [9] EntityValue ::=
// '"' ([^"&%] | Reference | PEReference)* '"'
// | "'" ([^'&%] | Reference | PEReference)* "'"
// [10] AttValue ::=
// '"' ([^"&] | Reference )* '"'
// | "'" ([^'&] | Reference )* "'"
char c;
fatal("P-007");
}
// don't report entity expansions within attributes,
// they're reported "fully expanded" via SAX
// isInAttribute = !isEntityValue;
// get value into strTmp
strTmp = new StringBuffer();
// expanded entities can't terminate the literal!
for (; ;) {
// we don't report end of parsed entities
// within attributes (no SAX hooks)
continue;
}
break;
}
//
// Basically the "reference in attribute value"
// row of the chart in section 4.4 of the spec
//
if (c == '&') {
if (entityName != null) {
// 4.4 says: bypass these here ... we'll catch
// forbidden refs to unparsed entities on use
if (isEntityValue) {
continue;
}
// character references are always included immediately
} else if ((c = getc()) == '#') {
int tmp = parseCharNumber();
if (tmp > 0xffff) {
if (tmp == 2)
} else
} else
fatal("P-009");
continue;
}
// expand parameter entities only within entity value literals
if (c == '%' && isEntityValue) {
if (entityName != null) {
continue;
} else
fatal("P-011");
}
// For attribute values ...
if (!isEntityValue) {
// 3.3.3 says whitespace normalizes to space...
if (c == ' ' || c == '\t' || c == '\n' || c == '\r') {
continue;
}
// "<" not legal in parsed literals ...
if (c == '<')
fatal("P-012");
}
}
// isInAttribute = false;
}
// does a SINGLE expansion of the entity (often reparsed later)
boolean isEntityValue)
throws IOException, SAXException {
if (entity instanceof InternalEntity) {
} else if (entity instanceof ExternalEntity) {
if (!isEntityValue) // must be a PE ...
// XXX if this returns false ...
//
// Note: much confusion about whether spec requires such
// errors to be fatal in many cases, but none about whether
// it allows "normal" errors to be unrecoverable!
//
}
}
// [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
// for PUBLIC and SYSTEM literals, also "<?xml ...type='literal'?>'
// NOTE: XML spec should explicitly say that PE ref syntax is
// ignored in PIs, comments, SystemLiterals, and Pubid Literal
// values ... can't process the XML spec's own DTD without doing
// that for comments.
throws IOException, SAXException {
// use in.getc to bypass PE processing
});
char c;
strTmp = new StringBuffer();
}
// [12] PubidLiteral ::= ('"' PubidChar* '"') | ("'" PubidChar* "'")
// [13] PubidChar ::= #x20|#xd|#xa|[a-zA-Z0-9]|[-'()+,./:=?;!*#@$_%]
&& !(c >= 'A' && c <= 'Z')
&& !(c >= 'a' && c <= 'z'))
}
strTmp = new StringBuffer();
return normalize(false);
}
// [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
// handled by: InputEntity.parsedContent()
throws IOException, SAXException {
// [15] Comment ::= '<!--'
// ( (Char - '-') | ('-' (Char - '-'))*
// '-->'
return false;
boolean savedLexicalPE = doLexicalPE;
boolean saveCommentText;
doLexicalPE = false;
saveCommentText = false;
if (saveCommentText)
strTmp = new StringBuffer();
for (; ;) {
try {
// bypass PE expansion, but permit PEs
// to complete ... valid docs won't care.
for (; ;) {
int c = getc();
if (c == '-') {
c = getc();
if (c != '-') {
if (saveCommentText)
ungetc();
continue;
}
break oneComment;
}
if (saveCommentText)
}
} catch (EndOfInputException e) {
//
// This is fatal EXCEPT when we're processing a PE...
// in which case a validating processor reports an error.
// External PEs are easy to detect; internal ones we
// infer by being an internal entity outside an element.
//
if (in.isInternal()) {
}
fatal("P-017");
}
}
if (saveCommentText)
return true;
}
throws IOException, SAXException {
// [16] PI ::= '<?' PITarget
// (S (Char* - (Char* '?>' Char*)))?
// '?>'
// [17] PITarget ::= Name - (('X'|'x')('M'|'m')('L'|'l')
boolean savedLexicalPE = doLexicalPE;
return false;
doLexicalPE = false;
fatal("P-018");
}
fatal("P-019");
}
}
if (maybeWhitespace()) {
strTmp = new StringBuffer();
try {
for (; ;) {
// use in.getc to bypass PE processing
//Reached the end of PI.
break;
}
} catch (EndOfInputException e) {
fatal("P-021");
}
} else {
fatal("P-022");
}
}
return true;
}
// [18] CDSect ::= CDStart CData CDEnd
// [19] CDStart ::= '<![CDATA['
// [20] CData ::= (Char* - (Char* ']]>' Char*))
// [21] CDEnd ::= ']]>'
//
// ... handled by InputEntity.unparsedContent()
// collapsing several rules together ...
// simpler than attribute literals -- no reference parsing!
throws IOException, SAXException {
// [24] VersionInfo ::= S 'version' Eq \'|\" versionNum \'|\"
// [80] EncodingDecl ::= S 'encoding' Eq \'|\" EncName \'|\"
// [32] SDDecl ::= S 'standalone' Eq \'|\" ... \'|\"
if (!maybeWhitespace()) {
if (!must) {
return null;
}
// NOTREACHED
}
if (must) {
} else {
// To ensure that the whitespace is there so that when we
// check for the next attribute we assure that the
// whitespace still exists.
ungetc();
return null;
}
}
// [25] Eq ::= S? '=' S?
}
throws IOException, SAXException {
// [26] versionNum ::= ([a-zA-Z0-9_.:]| '-')+
for (int i = 0; i < length; i++) {
if (!((c >= '0' && c <= '9')
|| c == '_' || c == '.'
|| (c >= 'a' && c <= 'z')
|| (c >= 'A' && c <= 'Z')
|| c == ':' || c == '-')
)
}
}
}
// common code used by most markup declarations
// ... S (Q)Name ...
throws IOException, SAXException {
name = maybeGetName();
return name;
}
private boolean maybeMarkupDecl()
        throws IOException, SAXException {
    // [29] markupdecl ::= elementdecl | Attlistdecl
    //                   | EntityDecl | NotationDecl | PI | Comment
    //
    // Each alternative is attempted in grammar order; the first one that
    // matches ends the search, so later alternatives are never probed.
    if (maybeElementDecl()) {
        return true;
    }
    if (maybeAttlistDecl()) {
        return true;
    }
    if (maybeEntityDecl()) {
        return true;
    }
    if (maybeNotationDecl()) {
        return true;
    }
    if (maybePI(false)) {
        return true;
    }
    return maybeComment(false);
}
// [33] LanguageId ::= Langcode ('-' Subcode)*
// [34] Langcode ::= ISO639Code | IanaCode | UserCode
// [35] ISO639Code ::= [a-zA-Z] [a-zA-Z]
// [36] IanaCode ::= [iI] '-' SubCode
// [37] UserCode ::= [xX] '-' SubCode
// [38] SubCode ::= [a-zA-Z]+
// the ISO and IANA codes (and subcodes) are registered,
// but that's neither a WF nor a validity constraint.
int nextSuffix;
char c;
return false;
if (c == '-') { // IANA, or user, code
if (!(c == 'i' || c == 'I' || c == 'x' || c == 'X'))
return false;
nextSuffix = 1;
} else if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) {
// 2 letter ISO code, or error
if (!((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')))
return false;
nextSuffix = 2;
} else
return false;
// here "suffix" ::= '-' [a-zA-Z]+ suffix*
if (c != '-')
break;
if (!((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')))
break;
}
}
}
//
// CHAPTER 3: Logical Structures
//
/**
* To validate, subclassers should at this time make sure that
* values are of the declared types:<UL>
* <LI> ID and IDREF(S) values are Names
* <LI> NMTOKEN(S) are Nmtokens
* <LI> ENUMERATION values match one of the tokens
* <LI> NOTATION values match a notation name
* <LI> ENTITY(IES) values match an unparsed external entity
* </UL>
* <p/>
* <P> Separately, make sure IDREF values match some ID
* provided in the document (in the afterRoot method).
*/
/* void validateAttributeSyntax (Attribute attr, String value)
throws DTDParseException {
// ID, IDREF(S) ... values are Names
if (Attribute.ID == attr.type()) {
if (!XmlNames.isName (value))
error ("V-025", new Object [] { value });
Boolean b = (Boolean) ids.getNonInterned (value);
if (b == null || b.equals (Boolean.FALSE))
ids.put (value.intern (), Boolean.TRUE);
else
error ("V-026", new Object [] { value });
} else if (Attribute.IDREF == attr.type()) {
if (!XmlNames.isName (value))
error ("V-027", new Object [] { value });
Boolean b = (Boolean) ids.getNonInterned (value);
if (b == null)
ids.put (value.intern (), Boolean.FALSE);
} else if (Attribute.IDREFS == attr.type()) {
StringTokenizer tokenizer = new StringTokenizer (value);
Boolean b;
boolean sawValue = false;
while (tokenizer.hasMoreTokens ()) {
value = tokenizer.nextToken ();
if (!XmlNames.isName (value))
error ("V-027", new Object [] { value });
b = (Boolean) ids.getNonInterned (value);
if (b == null)
ids.put (value.intern (), Boolean.FALSE);
sawValue = true;
}
if (!sawValue)
error ("V-039", null);
// NMTOKEN(S) ... values are Nmtoken(s)
} else if (Attribute.NMTOKEN == attr.type()) {
if (!XmlNames.isNmtoken (value))
error ("V-028", new Object [] { value });
} else if (Attribute.NMTOKENS == attr.type()) {
StringTokenizer tokenizer = new StringTokenizer (value);
boolean sawValue = false;
while (tokenizer.hasMoreTokens ()) {
value = tokenizer.nextToken ();
if (!XmlNames.isNmtoken (value))
error ("V-028", new Object [] { value });
sawValue = true;
}
if (!sawValue)
error ("V-032", null);
// ENUMERATION ... values match one of the tokens
} else if (Attribute.ENUMERATION == attr.type()) {
for (int i = 0; i < attr.values().length; i++)
if (value.equals (attr.values()[i]))
return;
error ("V-029", new Object [] { value });
// NOTATION values match a notation name
} else if (Attribute.NOTATION == attr.type()) {
//
// XXX XML 1.0 spec should probably list references to
// externally defined notations in standalone docs as
// validity errors. Ditto externally defined unparsed
// entities; neither should show up in attributes, else
// one needs to read the external declarations in order
// to make sense of the document (exactly what tagging
// a doc as "standalone" intends you won't need to do).
//
for (int i = 0; i < attr.values().length; i++)
if (value.equals (attr.values()[i]))
return;
error ("V-030", new Object [] { value });
// ENTITY(IES) values match an unparsed entity(ies)
} else if (Attribute.ENTITY == attr.type()) {
// see note above re standalone
if (!isUnparsedEntity (value))
error ("V-031", new Object [] { value });
} else if (Attribute.ENTITIES == attr.type()) {
StringTokenizer tokenizer = new StringTokenizer (value);
boolean sawValue = false;
while (tokenizer.hasMoreTokens ()) {
value = tokenizer.nextToken ();
// see note above re standalone
if (!isUnparsedEntity (value))
error ("V-031", new Object [] { value });
sawValue = true;
}
if (!sawValue)
error ("V-040", null);
} else if (Attribute.CDATA != attr.type())
throw new InternalError (attr.type());
}
*/
/*
private boolean isUnparsedEntity (String name)
{
Object e = entities.getNonInterned (name);
if (e == null || !(e instanceof ExternalEntity))
return false;
return ((ExternalEntity)e).notation != null;
}
*/
// Tries to parse an element declaration (production [45]); the boolean
// result tells the caller whether one was consumed.
// NOTE(review): several statements of this method are not present in the
// text as it stands (the test guarding the first "return false", the
// branches that the dangling "else" clauses belong to, and the statement
// controlled by "if (c != '>')"). The comments below describe only what is
// visible; restore the missing lines from the upstream source.
private boolean maybeElementDecl()
throws IOException, SAXException {
// [45] elementDecl ::= '<!ELEMENT' S Name S contentspec S? '>'
// [46] contentspec ::= 'EMPTY' | 'ANY' | Mixed | children
return false;
// n.b. for content models where inter-element whitespace is
// ignorable, we mark that fact here.
// Element element = (Element) elements.get (name);
// boolean declEffective = false;
/*
if (element != null) {
if (element.contentModel() != null) {
error ("V-012", new Object [] { name });
} // else <!ATTLIST name ...> came first
} else {
element = new Element(name);
elements.put (element.name(), element);
declEffective = true;
}
*/
else {
// declEffective = true;
}
// Declared here but never assigned in the visible lines.
short modelType;
// Whitespace is mandatory at this point; "F-000" is the role passed to
// whitespace() for diagnostics.
whitespace("F-000");
/// // leave element.contentModel as null for this case.
/// element.setContentModel(new StringModel(StringModelType.ANY));
} else {
}
// The declaration must be closed by '>'.
char c = getc();
if (c != '>')
/// dtdHandler.elementDecl(element);
return true;
}
// We're leaving the content model as a regular expression;
// it's an efficient natural way to express such things, and
// libraries often interpret them. No whitespace in the
// model we store, though!
/**
* returns content model type.
*/
throws IOException, SAXException {
// [47] children ::= (choice|seq) ('?'|'*'|'+')?
strTmp = new StringBuffer();
short modelType;
if (peek("#PCDATA")) {
} else {
}
return modelType;
}
// '(' S? already consumed
// matching ')' must be in "start" entity if validating
throws IOException, SAXException {
// [48] cp ::= (Name|choice|seq) ('?'|'*'|'+')?
// [49] choice ::= '(' S? cp (S? '|' S? cp)* S? ')'
// [50] seq ::= '(' S? cp (S? ',' S? cp)* S? ')'
boolean decided = false;
char type = 0;
// ContentModel retval, temp, current;
// retval = temp = current = null;
do {
tag = maybeGetName();
// temp = new ElementModel(tag);
// getFrequency((RepeatableContent)temp);
///->
///<-
} else if (peek("(")) {
// temp = getcps(element, next);
// getFrequency(temp);
///->
/// getFrequency(); <- this looks like a bug
///<-
} else
if (decided) {
char c = getc();
// if (current != null) {
// current.addChild(temp);
// }
if (c == type) {
continue;
} else if (c == '\u0029') { // rparen
ungetc();
continue;
} else {
new Object[]{
new Character(c),
});
}
} else {
switch (type) {
case '|':
case ',':
break;
default:
// retval = temp;
ungetc();
continue;
}
// retval = (ContentModel)current;
decided = true;
// current.addChild(temp);
}
} while (!peek(")"));
// return retval;
}
switch (type) {
case '|':
return;
case ',':
return;
default:
throw new Error(); //assertion failed.
}
}
// Reads an optional occurrence indicator ('?', '+' or '*') that may follow
// a content particle and returns the matching occurrence code. When the
// next character is none of the three it is pushed back with ungetc() and
// DTDEventListener.OCCURENCE_ONCE is returned.
// NOTE(review): the '?', '+' and '*' branches contain only commented-out
// code in the text as it stands, so they return nothing; the return
// statements for those three cases need to be restored from upstream.
private short getFrequency()
throws IOException, SAXException {
final char c = getc();
if (c == '?') {
// original.setRepeat(Repeat.ZERO_OR_ONE);
} else if (c == '+') {
// original.setRepeat(Repeat.ONE_OR_MORE);
} else if (c == '*') {
// original.setRepeat(Repeat.ZERO_OR_MORE);
} else {
// No indicator present: un-read the character.
ungetc();
return DTDEventListener.OCCURENCE_ONCE;
}
}
// '(' S? '#PCDATA' already consumed
// matching ')' must be in "start" entity if validating
throws IOException, SAXException {
// [51] Mixed ::= '(' S? '#PCDATA' (S? '|' S? Name)* S? ')*'
// | '(' S? '#PCDATA' S? ')'
// element.setContentModel(new StringModel(StringModelType.PCDATA));
return;
}
// l.add(new StringModel(StringModelType.PCDATA));
while (peek("|")) {
doLexicalPE = true;
name = maybeGetName();
} else {
}
}
// ChoiceModel cm = new ChoiceModel((Collection)l);
// cm.setRepeat(Repeat.ZERO_OR_MORE);
// element.setContentModel(cm);
}
// Tries to parse an attribute-list declaration (production [52]); the
// boolean result tells the caller whether one was consumed. The visible
// body loops until peek(">") succeeds, handling one attribute definition
// (name, type, default declaration) per iteration.
// NOTE(review): many statements of this method are not present in the text
// as it stands -- e.g. the test guarding the first "return false", the
// bodies of most peek(TYPE_...) branches, the statements after
// 'if (peek("|"))', and the rest of the 'fatal("P-045",' call. The comments
// below describe only what is visible; restore the missing lines from the
// upstream source.
private boolean maybeAttlistDecl()
throws IOException, SAXException {
// [52] AttlistDecl ::= '<!ATTLIST' S Name AttDef* S? '>'
return false;
// Element element = (Element) elements.get (name);
// if (element == null) {
// // not yet declared -- no problem.
// element = new Element(name);
// elements.put(name, element);
// }
while (!peek(">")) {
// [53] AttDef ::= S Name S AttType S DefaultDecl
// [54] AttType ::= StringType | TokenizedType | EnumeratedType
// look for global attribute definitions, don't expand for now...
char c = getc();
if (c == '%') {
if (entityName != null) {
whitespace("F-021");
continue;
} else
fatal("P-011");
}
// Not a '%' reference: give the character back.
ungetc();
// look for attribute name otherwise
}
whitespace("F-001");
/// Attribute a = new Attribute (name);
// Note: use the type constants from Attribute
// so that "==" may be used (faster)
// [55] StringType ::= 'CDATA'
if (peek(TYPE_CDATA))
/// a.setType(Attribute.CDATA);
// [56] TokenizedType ::= 'ID' | 'IDREF' | 'IDREFS'
// | 'ENTITY' | 'ENTITIES'
// | 'NMTOKEN' | 'NMTOKENS'
// n.b. if "IDREFS" is there, both "ID" and "IDREF"
// match peekahead ... so this order matters!
else if (peek(TYPE_IDREFS))
else if (peek(TYPE_IDREF))
// TODO: should implement this error check?
/// if (element.id() != null) {
/// error ("V-016", new Object [] { element.id() });
/// } else
/// element.setId(name);
} else if (peek(TYPE_ENTITY))
else if (peek(TYPE_ENTITIES))
else if (peek(TYPE_NMTOKENS))
else if (peek(TYPE_NMTOKEN))
// [57] EnumeratedType ::= NotationType | Enumeration
// [58] NotationType ::= 'NOTATION' S '(' S? Name
// (S? '|' S? Name)* S? ')'
else if (peek(TYPE_NOTATION)) {
whitespace("F-002");
do {
fatal("P-068");
// permit deferred declarations
if (peek("|"))
} while (!peek(")"));
/// a.setValues(new String [v.size ()]);
/// for (int i = 0; i < v.size (); i++)
/// a.setValue(i, (String)v.elementAt(i));
// [59] Enumeration ::= '(' S? Nmtoken (S? '|' Nmtoken)* S? ')'
} else if (peek("(")) {
/// a.setType(Attribute.ENUMERATION);
/// Vector v = new Vector ();
do {
/// v.addElement (name);
if (peek("|"))
} while (!peek(")"));
/// a.setValues(new String [v.size ()]);
/// for (int i = 0; i < v.size (); i++)
/// a.setValue(i, (String)v.elementAt(i));
} else {
fatal("P-045",
}
// Declared here but never assigned in the visible lines.
short attributeUse;
// [60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED'
// | (('#FIXED' S)? AttValue)
whitespace("F-003");
if (peek("#REQUIRED"))
/// a.setIsRequired(true);
else if (peek("#FIXED")) {
/// if (a.type() == Attribute.ID)
/// a.setIsFixed(true);
whitespace("F-004");
// The literal is parsed as an attribute value (isEntityValue == false).
parseLiteral(false);
/// if (a.type() != Attribute.CDATA)
/// a.setDefaultValue(normalize(false));
/// else
/// a.setDefaultValue(strTmp.toString());
// Reference comparison is deliberate here: this file's note on its string
// constants says to use those copies so that "==" works.
// NOTE(review): the older commented-out code above normalizes when the
// type is NOT CDATA, whereas this test normalizes when it IS -- confirm
// which is intended.
if (typeName == TYPE_CDATA)
defaultValue = normalize(false);
else
// TODO: implement this check
/// if (a.type() != Attribute.CDATA)
/// validateAttributeSyntax (a, a.defaultValue());
} else if (!peek("#IMPLIED")) {
/// if (a.type() == Attribute.ID)
parseLiteral(false);
/// if (a.type() != Attribute.CDATA)
/// a.setDefaultValue(normalize(false));
/// else
/// a.setDefaultValue(strTmp.toString());
if (typeName == TYPE_CDATA)
defaultValue = normalize(false);
else
// TODO: implement this check
/// if (a.type() != Attribute.CDATA)
/// validateAttributeSyntax (a, a.defaultValue());
} else {
// TODO: this looks like a fatal error.
}
// TODO: isn't it an error to specify the same attribute twice?
/// if (!element.attributes().contains(a)) {
/// element.addAttribute(a);
/// dtdHandler.attributeDecl(a);
/// }
}
return true;
}
// used when parsing literal attribute values,
// or public identifiers.
//
// input in strTmp
// this can allocate an extra string...
boolean didStrip = false;
if (s != s2) {
s = s2;
didStrip = true;
}
strTmp = new StringBuffer();
for (int i = 0; i < s.length(); i++) {
char c = s.charAt(i);
continue;
}
didStrip = true;
i--;
}
if (didStrip)
else
return s;
}
// Tries to parse a conditional section (production [61]). Returns false
// straight away when the input does not start with "<![". The visible body
// has two parts: a loop for INCLUDE sections that repeatedly accepts
// whitespace, PE references, markup declarations or nested conditional
// sections until "]]>", and a loop for IGNORE sections that tracks nesting
// depth of "<![" ... "]]>" pairs.
// NOTE(review): the statements that choose between the INCLUDE and IGNORE
// paths (and that guard 'fatal("P-046")') are not present in the text as it
// stands, which is why braces and "else" clauses below do not pair up;
// restore them from the upstream source.
private boolean maybeConditionalSect()
throws IOException, SAXException {
// [61] conditionalSect ::= includeSect | ignoreSect
if (!peek("<!["))
return false;
fatal("P-046");
// [62] includeSect ::= '<![' S? 'INCLUDE' S? '['
// extSubsetDecl ']]>'
for (; ;) {
}
if (peek("]]>"))
break;
// Whitespace and syntactic PE references are tried with lexical PE
// handling off; it is switched back on before markup declarations.
doLexicalPE = false;
if (maybeWhitespace())
continue;
if (maybePEReference())
continue;
doLexicalPE = true;
if (maybeMarkupDecl() || maybeConditionalSect())
continue;
fatal("P-047");
}
// [63] ignoreSect ::= '<![' S? 'IGNORE' S? '['
// ignoreSectcontents ']]>'
// [64] ignoreSectcontents ::= Ignore ('<!['
// ignoreSectcontents ']]>' Ignore)*
// [65] Ignore ::= Char* - (Char* ('<![' | ']]>') Char*)
// Depth counter: starts at 1 for the section being skipped, goes up on
// each nested "<![" and down on each "]]>"; skipping ends at zero.
int nestlevel = 1;
// ignoreSectcontents
doLexicalPE = false;
while (nestlevel > 0) {
char c = getc(); // will pop input entities
if (c == '<') {
if (peek("!["))
nestlevel++;
} else if (c == ']') {
if (peek("]>"))
nestlevel--;
} else
continue;
}
} else
return true;
}
//
// CHAPTER 4: Physical Structures
//
// parse decimal or hex numeric character reference
// Reads the numeric part of a character reference, up to and including the
// terminating ';', and returns the value it denotes. The caller has already
// consumed the leading "&#". A first character of 'x' selects hexadecimal;
// anything else is pushed back and the digits are read as decimal.
//
// Any character that is neither a valid digit nor ';' is reported through
// fatal() with "P-049" (decimal) or "P-050" (hex).
private int parseCharNumber()
        throws IOException, SAXException {
    char c;
    int retval = 0;

    // n.b. we ignore overflow ...
    if (getc() != 'x') {
        // Decimal form: the character just read is the first digit (or an
        // error), so hand it back for the loop below to examine.
        ungetc();
        for (; ;) {
            c = getc();
            if (c >= '0' && c <= '9') {
                retval *= 10;
                retval += (c - '0');
                continue;
            }
            if (c == ';')
                return retval;
            fatal("P-049");
        }
    } else {
        // Hexadecimal form: every digit shifts the accumulator one nibble
        // and then adds its own value. The letter digits a-f / A-F stand
        // for 10..15; without adding that value "&#x1F;" would come out as
        // 0x10 rather than 0x1F.
        for (; ;) {
            c = getc();
            if (c >= '0' && c <= '9') {
                retval <<= 4;
                retval += (c - '0');
                continue;
            }
            if (c >= 'a' && c <= 'f') {
                retval <<= 4;
                retval += 10 + (c - 'a');
                continue;
            }
            if (c >= 'A' && c <= 'F') {
                retval <<= 4;
                retval += 10 + (c - 'A');
                continue;
            }
            if (c == ';')
                return retval;
            fatal("P-050");
        }
    }
}
// parameter is a UCS-4 character ... i.e. not just 16 bit UNICODE,
// though still subject to the 'Char' construct in XML
throws SAXException {
if (ucs4 <= 0xffff) {
return 1;
}
} else if (ucs4 <= 0x0010ffff) {
// we represent these as UNICODE surrogate pairs
ucs4 -= 0x10000;
return 2;
}
// NOTREACHED
return -1;
}
// Tries to parse a parameter-entity reference (production [69]) in its
// syntactic form; the boolean result tells the caller whether one was
// consumed. The visible body distinguishes InternalEntity from
// ExternalEntity targets.
// NOTE(review): the test guarding the first "return false", the lookup that
// defines 'entity', the condition on 'fatal("P-011")' and the bodies of
// both instanceof branches are not present in the text as it stands;
// restore them from the upstream source.
private boolean maybePEReference()
throws IOException, SAXException {
// This is the SYNTACTIC version of this construct.
// When processing external entities, there is also
// a LEXICAL version; see getc() and doLexicalPE.
// [69] PEReference ::= '%' Name ';'
return false;
fatal("P-011");
if (entity instanceof InternalEntity) {
} else if (entity instanceof ExternalEntity) {
}
return true;
}
// Tries to parse an entity declaration (productions [70]-[74]); the boolean
// result tells the caller whether one was consumed. The visible body
// toggles doLexicalPE around the pieces of the declaration, then handles an
// internal entity (externalId == null, value read with parseLiteral(true))
// separately from an external one.
// NOTE(review): a number of statements are not present in the text as it
// stands -- the test guarding the first "return false", the code that sets
// 'doStore', 'entity' and 'externalId', the opening of the "} else" after
// whitespace("F-006"), the start of the '&& peek("NDATA")' condition, and
// the bodies of both "if (doStore)" blocks. Restore them from the upstream
// source; the comments here describe only what is visible.
private boolean maybeEntityDecl()
throws IOException, SAXException {
// [70] EntityDecl ::= GEDecl | PEDecl
// [71] GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>'
// [72] PEDecl ::= '<!ENTITY' S '%' S Name S PEDEF S? '>'
// [73] EntityDef ::= EntityValue | (ExternalID NDataDecl?)
// [74] PEDef ::= EntityValue | ExternalID
//
return false;
boolean doStore;
// PE expansion gets selectively turned off several places:
// in ENTITY declarations (here), in comments, in PIs.
// Here, we allow PE entities to be declared, and allows
// literals to include PE refs without the added spaces
// required with their expansion in markup decls.
doLexicalPE = false;
whitespace("F-005");
whitespace("F-006");
} else
ungetc(); // leave some whitespace
doLexicalPE = true;
whitespace("F-007");
//
// first definition sticks ... e.g. internal subset PEs are used
// to override DTD defaults. It's also an "error" to incorrectly
// redefine builtin internal entities, but since reporting such
// errors is optional we only give warnings ("just in case") for
// non-parameter entities.
//
// internal entities
if (externalId == null) {
// Declared here but never assigned in the visible lines.
char value [];
doLexicalPE = false; // "ab%bar;cd" -maybe-> "abcd"
// true => parse as an entity value rather than an attribute value.
parseLiteral(true);
doLexicalPE = true;
if (doStore) {
entity.isFromInternalSubset = false;
}
// external entities (including unparsed)
} else {
// [76] NDataDecl ::= S 'NDATA' S Name
&& peek("NDATA")) {
// flag undeclared notation for checking after
// the DTD is fully processed
}
externalId.isFromInternalSubset = false;
if (doStore) {
}
}
return true;
}
throws IOException, SAXException {
// [75] ExternalID ::= 'SYSTEM' S SystemLiteral
// | 'PUBLIC' S' PubidLiteral S Systemliteral
if (peek("PUBLIC")) {
whitespace("F-009");
temp = parsePublicId();
} else if (!peek("SYSTEM"))
return null;
whitespace("F-008");
return retval;
}
throws IOException, SAXException {
// resolve relative URIs ... must do it here since
// it's relative to the source file holding the URI!
// "new java.net.URL (URL, string)" conforms to RFC 1630,
// but we can't use that except when the URI is a URL.
// The entity resolver is allowed to handle URIs that are
// not URLs, so we pass URIs through with scheme intact
uri = ".";
else {
// XXX slashes at the beginning of a relative URI are
// a special case we don't handle.
throw new InternalError();
}
// letting other code map any "/xxx/../" or "/./" to "/",
// since all URIs must handle it the same.
}
// check for fragment ID in URI
return uri;
}
// Parses an optional text declaration at the current position. Nothing is
// consumed unless the input starts with "<?xml". Otherwise the optional
// version and the mandatory encoding are read, optional whitespace is
// skipped, and a missing closing "?>" is reported through fatal("P-057").
private void maybeTextDecl()
        throws IOException, SAXException {
    // [77] TextDecl ::= '<?xml' VersionInfo? EncodingDecl S? '?>'
    if (peek("<?xml")) {
        readVersion(false, "1.0");
        readEncoding(true);
        // The grammar permits whitespace between the encoding declaration
        // and the terminator (the "S?" above). Skip it so that a
        // declaration such as <?xml encoding="UTF-8" ?> is not rejected.
        maybeWhitespace();
        if (!peek("?>")) {
            fatal("P-057");
        }
    }
}
throws IOException, SAXException {
//
// Reap the intended benefits of standalone declarations:
// don't deal with external parameter entities, except to
// validate the standalone declaration.
//
// n.b. "in external parameter entities" (and external
// DTD subset, same grammar) parameter references can
// occur "within" markup declarations ... expansions can
// cross syntax rules. Flagged here; affects getc().
// [79] ExtPE ::= TextDecl? extSubsetDecl
// [31] extSubsetDecl ::= ( markupdecl | conditionalSect
// | PEReference | S )*
// XXX if this returns false ...
continue;
}
doLexicalPE = false;
if (maybeWhitespace())
continue;
if (maybePEReference())
continue;
doLexicalPE = true;
if (maybeMarkupDecl() || maybeConditionalSect())
continue;
break;
}
// if (in != pe) throw new InternalError("who popped my PE?");
}
throws IOException, SAXException {
// [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
return;
if ((c >= 'A' && c <= 'Z')
|| (c >= 'a' && c <= 'z'))
continue;
if (i != 0
&& ((c >= '0' && c <= '9')
|| c == '-'
|| c == '_'
|| c == '.'
))
continue;
}
//
// This should be the encoding in use, and it's even an error for
// it to be anything else (in certain cases that are impractical
// to test, and may even be insufficient). So, we do the best we
// can, and warn if things look suspicious. Note that Java doesn't
// uniformly expose the encodings, and that the names it uses
// internally are nonstandard. Also, that the XML spec allows
// such "errors" not to be reported at all.
//
if (currentEncoding != null
}
/**
 * Tries to parse a notation declaration, following the productions
 * quoted at the top of the body.
 * <p/>
 * NOTE(review): this copy of the method looks incomplete. The first
 * statement is an unconditional {@code return false}, one {@code if}
 * has no statement before its {@code else}, and a trailing
 * {@code else} has no matching {@code if}. Restore the missing
 * statements from the upstream source before relying on it.
 *
 * @return {@code false} or {@code true} as coded below; presumably
 *         {@code false} means no notation declaration was present and
 *         {@code true} means one was consumed -- confirm
 */
private boolean maybeNotationDecl()
throws IOException, SAXException {
// [82] NotationDecl ::= '<!NOTATION' S Name S
// (ExternalID | PublicID) S? '>'
// [83] PublicID ::= 'PUBLIC' S PubidLiteral
// NOTE(review): unconditional as written, so everything below is
// unreachable; presumably a guard condition has been lost.
return false;
// whitespace after the notation name (the second S in [82]),
// presumably mandatory with "F-011" as the diagnostic -- confirm
whitespace("F-011");
// identifier introduced by the PUBLIC keyword
if (peek("PUBLIC")) {
whitespace("F-009");
if (maybeWhitespace()) {
// NOTE(review): the statement for this 'if' is missing
if (!peek(">"))
else
// push back what was just consumed -- presumably the
// whitespace skipped above; confirm
ungetc();
}
// identifier introduced by the SYSTEM keyword
} else if (peek("SYSTEM")) {
whitespace("F-008");
} else
// neither keyword was found
fatal("P-062");
// NOTE(review): dangling else with an empty body
else {
}
return true;
}
////////////////////////////////////////////////////////////////
//
// UTILITIES
//
////////////////////////////////////////////////////////////////
if (!doLexicalPE) {
return c;
}
//
// External parameter entities get funky processing of '%param;'
// references. It's not clearly defined in the XML spec; but it
// boils down to having those refs be _lexical_ in most cases to
// include partial syntax productions. It also needs selective
// enabling; "<!ENTITY % foo ...>" must work, for example, and
// if "bar" is an empty string PE, "ab%bar;cd" becomes "abcd"
// if it's expanded in a literal, else "ab cd". PEs also do
// not expand within comments or PIs, and external PEs are only
// allowed to have markup decls (and so aren't handled lexically).
//
// This PE handling should be merged into maybeWhitespace, where
// it can be dealt with more consistently.
//
// Also, there are some validity constraints in this area.
//
char c;
else {
}
}
// PE ref ::= '%' name ';'
fatal("P-011");
// push a magic "entity" before and after the
// real one, so ungetc() behaves uniformly
if (entity instanceof InternalEntity)
else if (entity instanceof ExternalEntity)
// PEs can't be unparsed!
// XXX if this returns false ...
// see note in maybePEReference re making this be nonfatal.
fatal("V-022");
else
throw new InternalError();
}
return c;
}
/**
 * Pushes back the most recently read character by delegating to the
 * current input entity {@code in}.
 * <p/>
 * An empty body here would silently drop every pushback request, and
 * callers that peek past optional whitespace depend on it.
 */
private void ungetc() {
    in.ungetc();
}
throws IOException, SAXException {
}
// Return the entity starting the specified declaration
// (for validating declaration nesting) else null.
throws IOException, SAXException {
return null;
return start;
return null;
}
throws IOException, SAXException {
{new Character(c),
}
throws SAXException {
in = r;
}
throws IOException, SAXException {
InputSource s;
try {
} catch (IOException e) {
throw e;
}
in = r;
return true;
}
}
}
/**
 * Reports the line number of the current parse position.
 *
 * @return the line number reported by the current input entity
 *         {@code in}, or -1 when there is no current input entity
 */
public int getLineNumber() {
    return (in == null) ? -1 : in.getLineNumber();
}
/**
 * Reports the column number of the current parse position.
 *
 * @return the column number reported by the current input entity
 *         {@code in}, or -1 when there is no current input entity
 */
public int getColumnNumber() {
    return (in == null) ? -1 : in.getColumnNumber();
}
// error handling convenience routines
throws SAXException {
dtdHandler.warning(e);
}
throws SAXException {
dtdHandler.error(e);
}
}
throws SAXException {
dtdHandler.fatalError(e);
throw e;
}
//
// Map char arrays to strings ... cuts down both on memory and
// CPU usage for element/attribute/other names that are reused.
//
// Documents typically repeat names a lot, so we more or less
// intern all the strings within the document; since some strings
// are repeated in multiple documents (e.g. stylesheets) we go
// a bit further, and intern globally.
//
/**
 * Cache that hands out one shared {@link NameCacheEntry} per distinct
 * name, looked up by hashing the name's characters.
 * <p/>
 * NOTE(review): this copy of the class looks incomplete. No field
 * declarations or method headers are visible, only the statements
 * that were inside them. Restore them from the upstream source.
 */
static class NameCache {
//
// Unless we auto-grow this, the default size should be a
// reasonable bit larger than needed for most XML files
// we've yet seen (and be prime). If it's too small, the
// penalty is just excess cache collisions.
//
// NOTE(review): the table declaration this comment describes is
// not present here.
//
// Usually we just want to get the 'symbol' for these chars
//
// NOTE(review): the method this comment introduces is missing; only
// its closing brace remains.
}
//
// Sometimes we need to scan the chars in the resulting
// string, so there's an accessor which exposes them.
// (Mostly for element end tags.)
//
// NOTE(review): the accessor's header is missing; the statements
// below are its body.
int index = 0;
// hashing to get index
// NOTE(review): as written, the loop body is the mask statement; the
// per-character hash step appears to be missing.
for (int i = 0; i < len; i++)
// clear the sign bit so the value is non-negative
index &= 0x7fffffff;
// return entry if one's there ...
return entry;
}
// else create new one
entry = new NameCacheEntry();
//
// NOTE: JDK 1.1 has a fixed size string intern table,
// with non-GC'd entries. It can panic here; that's a
// JDK problem, use 1.2 or later with many identifiers.
//
return entry;
}
}
/**
 * A single entry of the name cache; {@code chars} holds the entry's
 * character data.
 * <p/>
 * NOTE(review): the statements after the field belong to a method
 * whose header is missing from this copy. They look like an
 * element-by-element comparison that returns false on a mismatch and
 * true otherwise -- confirm against the upstream source.
 */
static class NameCacheEntry {
// characters of the cached name
char chars [];
// NOTE(review): the condition guarding this early return is missing
return false;
// NOTE(review): the per-element test guarding the return inside this
// loop is missing
for (int i = 0; i < len; i++)
return false;
// reached only when no earlier return fired
return true;
}
}
//
// Message catalog for diagnostics.
//
/**
 * Creates the message catalog, passing {@code DTDParser.class} to the
 * superclass constructor (presumably so that message resources are
 * located relative to that class -- confirm against the superclass).
 */
Catalog() {
super(DTDParser.class);
}
}
}