internal/dtdparser/DTDParser.java

/*
 * Copyright (c) 1998, 2011, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package com.sun.xml.internal.dtdparser;

import org.xml.sax.EntityResolver;
import org.xml.sax.InputSource;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Locale;
import java.util.Set;
import java.util.Vector;

/**
 * This implements parsing of XML 1.0 DTDs.
 * <p/>
 * This conforms to the portion of the XML 1.0 specification related
 * to the external DTD subset.
 * <p/>
 * For multi-language applications (such as web servers using XML
 * processing to create dynamic content), a method supports choosing
 * a locale for parser diagnostics which is both understood by the
 * message recipient and supported by the parser.
 * <p/>
 * This parser produces a stream of parse events.  It supports some
 * features (exposing comments, CDATA sections, and entity references)
 * which are not required to be reported by conformant XML processors.
 *
 * @author David Brownell
 * @author Janet Koenig
 * @author Kohsuke KAWAGUCHI
 * @version $Id: DTDParser.java,v 1.2 2009-04-16 15:25:49 snajper Exp $
 */
public class DTDParser {
    public final static String TYPE_CDATA = "CDATA";
    public final static String TYPE_ID = "ID";
    public final static String TYPE_IDREF = "IDREF";
    public final static String TYPE_IDREFS = "IDREFS";
    public final static String TYPE_ENTITY = "ENTITY";
    public final static String TYPE_ENTITIES = "ENTITIES";
    public final static String TYPE_NMTOKEN = "NMTOKEN";
    public final static String TYPE_NMTOKENS = "NMTOKENS";
    public final static String TYPE_NOTATION = "NOTATION";
    public final static String TYPE_ENUMERATION = "ENUMERATION";


    // stack of input entities being merged
    private InputEntity in;

    // temporaries reused during parsing
    private StringBuffer strTmp;
    private char nameTmp [];
    private NameCache nameCache;
    private char charTmp [] = new char[2];

    // temporary DTD parsing state
    private boolean doLexicalPE;

    // DTD state, used during parsing
//    private SimpleHashtable    elements = new SimpleHashtable (47);
    protected final Set declaredElements = new java.util.HashSet();
    private SimpleHashtable params = new SimpleHashtable(7);

    // exposed to package-private subclass
    Hashtable notations = new Hashtable(7);
    SimpleHashtable entities = new SimpleHashtable(17);

    private SimpleHashtable ids = new SimpleHashtable();

    // listeners for DTD parsing events
    private DTDEventListener dtdHandler;

    private EntityResolver resolver;
    private Locale locale;

    // string constants -- use these copies so "==" works
    // package private
    static final String strANY = "ANY";
    static final String strEMPTY = "EMPTY";

    /**
     * Used by applications to request locale for diagnostics.
     *
     * @param l The locale to use, or null to use system defaults
     *          (which may include only message IDs).
     */
    public void setLocale(Locale l) throws SAXException {

        if (l != null && !messages.isLocaleSupported(l.toString())) {
            throw new SAXException(messages.getMessage(locale,
                    "P-078", new Object[]{l}));
        }
        locale = l;
    }

    /**
     * Returns the diagnostic locale.
     */
    public Locale getLocale() {
        return locale;
    }

    /**
     * Chooses a client locale to use for diagnostics, using the first
     * language specified in the list that is supported by this parser.
     * That locale is then set using <a href="#setLocale(java.util.Locale)">
     * setLocale()</a>.  Such a list could be provided by a variety of user
     * preference mechanisms, including the HTTP <em>Accept-Language</em>
     * header field.
     *
     * @param languages Array of language specifiers, ordered with the most
     *                  preferable one at the front.  For example, "en-ca" then "fr-ca",
     *                  followed by "zh_CN".  Both RFC 1766 and Java styles are supported.
     * @return The chosen locale, or null.
     * @see MessageCatalog
     */
    public Locale chooseLocale(String languages [])
            throws SAXException {

        Locale l = messages.chooseLocale(languages);

        if (l != null) {
            setLocale(l);
        }
        return l;
    }

    /**
     * Lets applications control entity resolution.
     */
    public void setEntityResolver(EntityResolver r) {

        resolver = r;
    }

    /**
     * Returns the object used to resolve entities
     */
    public EntityResolver getEntityResolver() {

        return resolver;
    }

    /**
     * Used by applications to set handling of DTD parsing events.
     */
    public void setDtdHandler(DTDEventListener handler) {
        dtdHandler = handler;
        if (handler != null)
            handler.setDocumentLocator(new Locator() {
                public String getPublicId() {
                    return DTDParser.this.getPublicId();
                }

                public String getSystemId() {
                    return DTDParser.this.getSystemId();
                }

                public int getLineNumber() {
                    return DTDParser.this.getLineNumber();
                }

                public int getColumnNumber() {
                    return DTDParser.this.getColumnNumber();
                }
            });
    }

    /**
     * Returns the handler used to for DTD parsing events.
     */
    public DTDEventListener getDtdHandler() {
        return dtdHandler;
    }

    /**
     * Parse a DTD.
     */
    public void parse(InputSource in)
            throws IOException, SAXException {
        init();
        parseInternal(in);
    }

    /**
     * Parse a DTD.
     */
    public void parse(String uri)
            throws IOException, SAXException {
        InputSource inSource;

        init();
        // System.out.println ("parse (\"" + uri + "\")");
        inSource = resolver.resolveEntity(null, uri);

        // If custom resolver punts resolution to parser, handle it ...
        if (inSource == null) {
            inSource = Resolver.createInputSource(new java.net.URL(uri), false);

            // ... or if custom resolver doesn't correctly construct the
            // input entity, patch it up enough so relative URIs work, and
            // issue a warning to minimize later confusion.
        } else if (inSource.getSystemId() == null) {
            warning("P-065", null);
            inSource.setSystemId(uri);
        }

        parseInternal(inSource);
    }

    // makes sure the parser is reset to "before a document"
    private void init() {
        in = null;

        // alloc temporary data used in parsing
        strTmp = new StringBuffer();
        nameTmp = new char[20];
        nameCache = new NameCache();

        // reset doc info
//        isInAttribute = false;

        doLexicalPE = false;

        entities.clear();
        notations.clear();
        params.clear();
        //    elements.clear ();
        declaredElements.clear();

        // initialize predefined references ... re-interpreted later
        builtin("amp", "&#38;");
        builtin("lt", "&#60;");
        builtin("gt", ">");
        builtin("quot", "\"");
        builtin("apos", "'");

        if (locale == null)
            locale = Locale.getDefault();
        if (resolver == null)
            resolver = new Resolver();
        if (dtdHandler == null)
            dtdHandler = new DTDHandlerBase();
    }

    private void builtin(String entityName, String entityValue) {
        InternalEntity entity;
        entity = new InternalEntity(entityName, entityValue.toCharArray());
        entities.put(entityName, entity);
    }


    ////////////////////////////////////////////////////////////////
    //
    // parsing is by recursive descent, code roughly
    // following the BNF rules except tweaked for simple
    // lookahead.  rules are more or less in numeric order,
    // except where code sharing suggests other structures.
    //
    // a classic benefit of recursive descent parsers:  it's
    // relatively easy to get diagnostics that make sense.
    //
    ////////////////////////////////////////////////////////////////


    private void parseInternal(InputSource input)
            throws IOException, SAXException {

        if (input == null)
            fatal("P-000");

        try {
            in = InputEntity.getInputEntity(dtdHandler, locale);
            in.init(input, null, null, false);

            dtdHandler.startDTD(in);

            // [30] extSubset ::= TextDecl? extSubsetDecl
            // [31] extSubsetDecl ::= ( markupdecl | conditionalSect
            //        | PEReference | S )*
            //    ... same as [79] extPE, which is where the code is

            ExternalEntity externalSubset = new ExternalEntity(in);
            externalParameterEntity(externalSubset);

            if (!in.isEOF()) {
                fatal("P-001", new Object[]
                {Integer.toHexString(((int) getc()))});
            }
            afterRoot();
            dtdHandler.endDTD();

        } catch (EndOfInputException e) {
            if (!in.isDocument()) {
                String name = in.getName();
                do {    // force a relevant URI and line number
                    in = in.pop();
                } while (in.isInternal());
                fatal("P-002", new Object[]{name});
            } else {
                fatal("P-003", null);
            }
        } catch (RuntimeException e) {
            // Don't discard location that triggered the exception
            // ## Should properly wrap exception
            System.err.print("Internal DTD parser error: "); // ##
            e.printStackTrace();
            throw new SAXParseException(e.getMessage() != null
                    ? e.getMessage() : e.getClass().getName(),
                    getPublicId(), getSystemId(),
                    getLineNumber(), getColumnNumber());

        } finally {
            // recycle temporary data used during parsing
            strTmp = null;
            nameTmp = null;
            nameCache = null;

            // ditto input sources etc
            if (in != null) {
                in.close();
                in = null;
            }

            // get rid of all DTD info ... some of it would be
            // useful for editors etc, investigate later.

            params.clear();
            entities.clear();
            notations.clear();
            declaredElements.clear();
//        elements.clear();
            ids.clear();
        }
    }

    void afterRoot() throws SAXException {
        // Make sure all IDREFs match declared ID attributes.  We scan
        // after the document element is parsed, since XML allows forward
        // references, and only now can we know if they're all resolved.

        for (Enumeration e = ids.keys();
             e.hasMoreElements();
                ) {
            String id = (String) e.nextElement();
            Boolean value = (Boolean) ids.get(id);
            if (Boolean.FALSE == value)
                error("V-024", new Object[]{id});
        }
    }


    // role is for diagnostics
    private void whitespace(String roleId)
            throws IOException, SAXException {

        // [3] S ::= (#x20 | #x9 | #xd | #xa)+
        if (!maybeWhitespace()) {
            fatal("P-004", new Object[]
            {messages.getMessage(locale, roleId)});
        }
    }

    // S?
    private boolean maybeWhitespace()
            throws IOException, SAXException {

        if (!doLexicalPE)
            return in.maybeWhitespace();

        // see getc() for the PE logic -- this lets us splice
        // expansions of PEs in "anywhere".  getc() has smarts,
        // so for external PEs we don't bypass it.

        // XXX we can marginally speed PE handling, and certainly
        // be cleaner (hence potentially more correct), by using
        // the observations that expanded PEs only start and stop
        // where whitespace is allowed.  getc wouldn't need any
        // "lexical" PE expansion logic, and no other method needs
        // to handle termination of PEs.  (parsing of literals would
        // still need to pop entities, but not parsing of references
        // in content.)

        char c = getc();
        boolean saw = false;

        while (c == ' ' || c == '\t' || c == '\n' || c == '\r') {
            saw = true;

            // this gracefully ends things when we stop playing
            // with internal parameters.  caller should have a
            // grammar rule allowing whitespace at end of entity.
            if (in.isEOF() && !in.isInternal())
                return saw;
            c = getc();
        }
        ungetc();
        return saw;
    }

    private String maybeGetName()
            throws IOException, SAXException {

        NameCacheEntry entry = maybeGetNameCacheEntry();
        return (entry == null) ? null : entry.name;
    }

    private NameCacheEntry maybeGetNameCacheEntry()
            throws IOException, SAXException {

        // [5] Name ::= (Letter|'_'|':') (Namechar)*
        char c = getc();

        if (!XmlChars.isLetter(c) && c != ':' && c != '_') {
            ungetc();
            return null;
        }
        return nameCharString(c);
    }

    // Used when parsing enumerations
    private String getNmtoken()
            throws IOException, SAXException {

        // [7] Nmtoken ::= (Namechar)+
        char c = getc();
        if (!XmlChars.isNameChar(c))
            fatal("P-006", new Object[]{new Character(c)});
        return nameCharString(c).name;
    }

    // n.b. this gets used when parsing attribute values (for
    // internal references) so we can't use strTmp; it's also
    // a hotspot for CPU and memory in the parser (called at least
    // once for each element) so this has been optimized a bit.

    private NameCacheEntry nameCharString(char c)
            throws IOException, SAXException {

        int i = 1;

        nameTmp[0] = c;
        for (; ;) {
            if ((c = in.getNameChar()) == 0)
                break;
            if (i >= nameTmp.length) {
                char tmp [] = new char[nameTmp.length + 10];
                System.arraycopy(nameTmp, 0, tmp, 0, nameTmp.length);
                nameTmp = tmp;
            }
            nameTmp[i++] = c;
        }
        return nameCache.lookupEntry(nameTmp, i);
    }

    //
    // much similarity between parsing entity values in DTD
    // and attribute values (in DTD or content) ... both follow
    // literal parsing rules, newline canonicalization, etc
    //
    // leaves value in 'strTmp' ... either a "replacement text" (4.5),
    // or else partially normalized attribute value (the first bit
    // of 3.3.3's spec, without the "if not CDATA" bits).
    //
    private void parseLiteral(boolean isEntityValue)
            throws IOException, SAXException {

        // [9] EntityValue ::=
        //    '"' ([^"&%] | Reference | PEReference)* '"'
        //    |    "'" ([^'&%] | Reference | PEReference)* "'"
        // [10] AttValue ::=
        //    '"' ([^"&]  | Reference             )* '"'
        //    |    "'" ([^'&]  | Reference             )* "'"
        char quote = getc();
        char c;
        InputEntity source = in;

        if (quote != '\'' && quote != '"') {
            fatal("P-007");
        }

        // don't report entity expansions within attributes,
        // they're reported "fully expanded" via SAX
//    isInAttribute = !isEntityValue;

        // get value into strTmp
        strTmp = new StringBuffer();

        // scan, allowing entity push/pop wherever ...
        // expanded entities can't terminate the literal!
        for (; ;) {
            if (in != source && in.isEOF()) {
                // we don't report end of parsed entities
                // within attributes (no SAX hooks)
                in = in.pop();
                continue;
            }
            if ((c = getc()) == quote && in == source) {
                break;
            }

            //
            // Basically the "reference in attribute value"
            // row of the chart in section 4.4 of the spec
            //
            if (c == '&') {
                String entityName = maybeGetName();

                if (entityName != null) {
                    nextChar(';', "F-020", entityName);

                    // 4.4 says:  bypass these here ... we'll catch
                    // forbidden refs to unparsed entities on use
                    if (isEntityValue) {
                        strTmp.append('&');
                        strTmp.append(entityName);
                        strTmp.append(';');
                        continue;
                    }
                    expandEntityInLiteral(entityName, entities, isEntityValue);


                    // character references are always included immediately
                } else if ((c = getc()) == '#') {
                    int tmp = parseCharNumber();

                    if (tmp > 0xffff) {
                        tmp = surrogatesToCharTmp(tmp);
                        strTmp.append(charTmp[0]);
                        if (tmp == 2)
                            strTmp.append(charTmp[1]);
                    } else
                        strTmp.append((char) tmp);
                } else
                    fatal("P-009");
                continue;

            }

            // expand parameter entities only within entity value literals
            if (c == '%' && isEntityValue) {
                String entityName = maybeGetName();

                if (entityName != null) {
                    nextChar(';', "F-021", entityName);
                    expandEntityInLiteral(entityName, params, isEntityValue);
                    continue;
                } else
                    fatal("P-011");
            }

            // For attribute values ...
            if (!isEntityValue) {
                // 3.3.3 says whitespace normalizes to space...
                if (c == ' ' || c == '\t' || c == '\n' || c == '\r') {
                    strTmp.append(' ');
                    continue;
                }

                // "<" not legal in parsed literals ...
                if (c == '<')
                    fatal("P-012");
            }

            strTmp.append(c);
        }
//    isInAttribute = false;
    }

    // does a SINGLE expansion of the entity (often reparsed later)
    private void expandEntityInLiteral(String name, SimpleHashtable table,
                                       boolean isEntityValue)
            throws IOException, SAXException {

        Object entity = table.get(name);

        if (entity instanceof InternalEntity) {
            InternalEntity value = (InternalEntity) entity;
            pushReader(value.buf, name, !value.isPE);

        } else if (entity instanceof ExternalEntity) {
            if (!isEntityValue)    // must be a PE ...
                fatal("P-013", new Object[]{name});
            // XXX if this returns false ...
            pushReader((ExternalEntity) entity);

        } else if (entity == null) {
            //
            // Note:  much confusion about whether spec requires such
            // errors to be fatal in many cases, but none about whether
            // it allows "normal" errors to be unrecoverable!
            //
            fatal((table == params) ? "V-022" : "P-014",
                    new Object[]{name});
        }
    }

    // [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
    // for PUBLIC and SYSTEM literals, also "<?xml ...type='literal'?>'

    // NOTE:  XML spec should explicitly say that PE ref syntax is
    // ignored in PIs, comments, SystemLiterals, and Pubid Literal
    // values ... can't process the XML spec's own DTD without doing
    // that for comments.

    private String getQuotedString(String type, String extra)
            throws IOException, SAXException {

        // use in.getc to bypass PE processing
        char quote = in.getc();

        if (quote != '\'' && quote != '"')
            fatal("P-015", new Object[]{
                messages.getMessage(locale, type, new Object[]{extra})
            });

        char c;

        strTmp = new StringBuffer();
        while ((c = in.getc()) != quote)
            strTmp.append((char) c);
        return strTmp.toString();
    }


    private String parsePublicId() throws IOException, SAXException {

        // [12] PubidLiteral ::= ('"' PubidChar* '"') | ("'" PubidChar* "'")
        // [13] PubidChar ::= #x20|#xd|#xa|[a-zA-Z0-9]|[-'()+,./:=?;!*#@$_%]
        String retval = getQuotedString("F-033", null);
        for (int i = 0; i < retval.length(); i++) {
            char c = retval.charAt(i);
            if (" \r\n-'()+,./:=?;!*#@$_%0123456789".indexOf(c) == -1
                    && !(c >= 'A' && c <= 'Z')
                    && !(c >= 'a' && c <= 'z'))
                fatal("P-016", new Object[]{new Character(c)});
        }
        strTmp = new StringBuffer();
        strTmp.append(retval);
        return normalize(false);
    }

    // [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
    // handled by:  InputEntity.parsedContent()

    private boolean maybeComment(boolean skipStart)
            throws IOException, SAXException {

        // [15] Comment ::= '<!--'
        //        ( (Char - '-') | ('-' (Char - '-'))*
        //        '-->'
        if (!in.peek(skipStart ? "!--" : "<!--", null))
            return false;

        boolean savedLexicalPE = doLexicalPE;
        boolean saveCommentText;

        doLexicalPE = false;
        saveCommentText = false;
        if (saveCommentText)
            strTmp = new StringBuffer();

        oneComment:
        for (; ;) {
            try {
                // bypass PE expansion, but permit PEs
                // to complete ... valid docs won't care.
                for (; ;) {
                    int c = getc();
                    if (c == '-') {
                        c = getc();
                        if (c != '-') {
                            if (saveCommentText)
                                strTmp.append('-');
                            ungetc();
                            continue;
                        }
                        nextChar('>', "F-022", null);
                        break oneComment;
                    }
                    if (saveCommentText)
                        strTmp.append((char) c);
                }
            } catch (EndOfInputException e) {
                //
                // This is fatal EXCEPT when we're processing a PE...
                // in which case a validating processor reports an error.
                // External PEs are easy to detect; internal ones we
                // infer by being an internal entity outside an element.
                //
                if (in.isInternal()) {
                    error("V-021", null);
                }
                fatal("P-017");
            }
        }
        doLexicalPE = savedLexicalPE;
        if (saveCommentText)
            dtdHandler.comment(strTmp.toString());
        return true;
    }

    private boolean maybePI(boolean skipStart)
            throws IOException, SAXException {

        // [16] PI ::= '<?' PITarget
        //        (S (Char* - (Char* '?>' Char*)))?
        //        '?>'
        // [17] PITarget ::= Name - (('X'|'x')('M'|'m')('L'|'l')
        boolean savedLexicalPE = doLexicalPE;

        if (!in.peek(skipStart ? "?" : "<?", null))
            return false;
        doLexicalPE = false;

        String target = maybeGetName();

        if (target == null) {
            fatal("P-018");
        }
        if ("xml".equals(target)) {
            fatal("P-019");
        }
        if ("xml".equalsIgnoreCase(target)) {
            fatal("P-020", new Object[]{target});
        }

        if (maybeWhitespace()) {
            strTmp = new StringBuffer();
            try {
                for (; ;) {
                    // use in.getc to bypass PE processing
                    char c = in.getc();
                    //Reached the end of PI.
                    if (c == '?' && in.peekc('>'))
                        break;
                    strTmp.append(c);
                }
            } catch (EndOfInputException e) {
                fatal("P-021");
            }
            dtdHandler.processingInstruction(target, strTmp.toString());
        } else {
            if (!in.peek("?>", null)) {
                fatal("P-022");
            }
            dtdHandler.processingInstruction(target, "");
        }

        doLexicalPE = savedLexicalPE;
        return true;
    }

    // [18] CDSect ::= CDStart CData CDEnd
    // [19] CDStart ::= '<![CDATA['
    // [20] CData ::= (Char* - (Char* ']]>' Char*))
    // [21] CDEnd ::= ']]>'
    //
    //    ... handled by InputEntity.unparsedContent()

    // collapsing several rules together ...
    // simpler than attribute literals -- no reference parsing!
    private String maybeReadAttribute(String name, boolean must)
            throws IOException, SAXException {

        // [24] VersionInfo ::= S 'version' Eq \'|\" versionNum \'|\"
        // [80] EncodingDecl ::= S 'encoding' Eq \'|\" EncName \'|\"
        // [32] SDDecl ::=  S 'standalone' Eq \'|\" ... \'|\"
        if (!maybeWhitespace()) {
            if (!must) {
                return null;
            }
            fatal("P-024", new Object[]{name});
            // NOTREACHED
        }

        if (!peek(name)) {
            if (must) {
                fatal("P-024", new Object[]{name});
            } else {
                // To ensure that the whitespace is there so that when we
                // check for the next attribute we assure that the
                // whitespace still exists.
                ungetc();
                return null;
            }
        }

        // [25] Eq ::= S? '=' S?
        maybeWhitespace();
        nextChar('=', "F-023", null);
        maybeWhitespace();

        return getQuotedString("F-035", name);
    }

    private void readVersion(boolean must, String versionNum)
            throws IOException, SAXException {

        String value = maybeReadAttribute("version", must);

        // [26] versionNum ::= ([a-zA-Z0-9_.:]| '-')+

        if (must && value == null)
            fatal("P-025", new Object[]{versionNum});
        if (value != null) {
            int length = value.length();
            for (int i = 0; i < length; i++) {
                char c = value.charAt(i);
                if (!((c >= '0' && c <= '9')
                        || c == '_' || c == '.'
                        || (c >= 'a' && c <= 'z')
                        || (c >= 'A' && c <= 'Z')
                        || c == ':' || c == '-')
                )
                    fatal("P-026", new Object[]{value});
            }
        }
        if (value != null && !value.equals(versionNum))
            error("P-027", new Object[]{versionNum, value});
    }

    // common code used by most markup declarations
    // ... S (Q)Name ...
    private String getMarkupDeclname(String roleId, boolean qname)
            throws IOException, SAXException {

        String name;

        whitespace(roleId);
        name = maybeGetName();
        if (name == null)
            fatal("P-005", new Object[]
            {messages.getMessage(locale, roleId)});
        return name;
    }

    private boolean maybeMarkupDecl()
            throws IOException, SAXException {

        // [29] markupdecl ::= elementdecl | Attlistdecl
        //           | EntityDecl | NotationDecl | PI | Comment
        return maybeElementDecl()
                || maybeAttlistDecl()
                || maybeEntityDecl()
                || maybeNotationDecl()
                || maybePI(false)
                || maybeComment(false);
    }

    private static final String XmlLang = "xml:lang";

    private boolean isXmlLang(String value) {

        // [33] LanguageId ::= Langcode ('-' Subcode)*
        // [34] Langcode ::= ISO639Code | IanaCode | UserCode
        // [35] ISO639Code ::= [a-zA-Z] [a-zA-Z]
        // [36] IanaCode ::= [iI] '-' SubCode
        // [37] UserCode ::= [xX] '-' SubCode
        // [38] SubCode ::= [a-zA-Z]+

        // the ISO and IANA codes (and subcodes) are registered,
        // but that's neither a WF nor a validity constraint.

        int nextSuffix;
        char c;

        if (value.length() < 2)
            return false;
        c = value.charAt(1);
        if (c == '-') {        // IANA, or user, code
            c = value.charAt(0);
            if (!(c == 'i' || c == 'I' || c == 'x' || c == 'X'))
                return false;
            nextSuffix = 1;
        } else if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) {
            // 2 letter ISO code, or error
            c = value.charAt(0);
            if (!((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')))
                return false;
            nextSuffix = 2;
        } else
            return false;

        // here "suffix" ::= '-' [a-zA-Z]+ suffix*
        while (nextSuffix < value.length()) {
            c = value.charAt(nextSuffix);
            if (c != '-')
                break;
            while (++nextSuffix < value.length()) {
                c = value.charAt(nextSuffix);
                if (!((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')))
                    break;
            }
        }
        return value.length() == nextSuffix && c != '-';
    }


    //
    // CHAPTER 3:  Logical Structures
    //

    /**
     * To validate, subclassers should at this time make sure that
     * values are of the declared types:<UL>
     * <LI> ID and IDREF(S) values are Names
     * <LI> NMTOKEN(S) are Nmtokens
     * <LI> ENUMERATION values match one of the tokens
     * <LI> NOTATION values match a notation name
     * <LI> ENTITIY(IES) values match an unparsed external entity
     * </UL>
     * <p/>
     * <P> Separately, make sure IDREF values match some ID
     * provided in the document (in the afterRoot method).
     */
/*    void validateAttributeSyntax (Attribute attr, String value)
         throws DTDParseException {
        // ID, IDREF(S) ... values are Names
        if (Attribute.ID == attr.type()) {
            if (!XmlNames.isName (value))
                error ("V-025", new Object [] { value });

            Boolean             b = (Boolean) ids.getNonInterned (value);
            if (b == null || b.equals (Boolean.FALSE))
                ids.put (value.intern (), Boolean.TRUE);
            else
                error ("V-026", new Object [] { value });

        } else if (Attribute.IDREF == attr.type()) {
            if (!XmlNames.isName (value))
                error ("V-027", new Object [] { value });

            Boolean             b = (Boolean) ids.getNonInterned (value);
            if (b == null)
                ids.put (value.intern (), Boolean.FALSE);

        } else if (Attribute.IDREFS == attr.type()) {
            StringTokenizer     tokenizer = new StringTokenizer (value);
            Boolean             b;
            boolean             sawValue = false;

            while (tokenizer.hasMoreTokens ()) {
                value = tokenizer.nextToken ();
                if (!XmlNames.isName (value))
                    error ("V-027", new Object [] { value });
                b = (Boolean) ids.getNonInterned (value);
                if (b == null)
                    ids.put (value.intern (), Boolean.FALSE);
                sawValue = true;
            }
            if (!sawValue)
                error ("V-039", null);


        // NMTOKEN(S) ... values are Nmtoken(s)
        } else if (Attribute.NMTOKEN == attr.type()) {
            if (!XmlNames.isNmtoken (value))
                error ("V-028", new Object [] { value });

        } else if (Attribute.NMTOKENS == attr.type()) {
            StringTokenizer     tokenizer = new StringTokenizer (value);
            boolean             sawValue = false;

            while (tokenizer.hasMoreTokens ()) {
                value = tokenizer.nextToken ();
                if (!XmlNames.isNmtoken (value))
                    error ("V-028", new Object [] { value });
                sawValue = true;
            }
            if (!sawValue)
                error ("V-032", null);

        // ENUMERATION ... values match one of the tokens
        } else if (Attribute.ENUMERATION == attr.type()) {
            for (int i = 0; i < attr.values().length; i++)
                if (value.equals (attr.values()[i]))
                    return;
            error ("V-029", new Object [] { value });

        // NOTATION values match a notation name
        } else if (Attribute.NOTATION == attr.type()) {
            //
            // XXX XML 1.0 spec should probably list references to
            // externally defined notations in standalone docs as
            // validity errors.  Ditto externally defined unparsed
            // entities; neither should show up in attributes, else
            // one needs to read the external declarations in order
            // to make sense of the document (exactly what tagging
            // a doc as "standalone" intends you won't need to do).
            //
            for (int i = 0; i < attr.values().length; i++)
                if (value.equals (attr.values()[i]))
                    return;
            error ("V-030", new Object [] { value });

        // ENTITY(IES) values match an unparsed entity(ies)
        } else if (Attribute.ENTITY == attr.type()) {
            // see note above re standalone
            if (!isUnparsedEntity (value))
                error ("V-031", new Object [] { value });

        } else if (Attribute.ENTITIES == attr.type()) {
            StringTokenizer     tokenizer = new StringTokenizer (value);
            boolean             sawValue = false;

            while (tokenizer.hasMoreTokens ()) {
                value = tokenizer.nextToken ();
                // see note above re standalone
                if (!isUnparsedEntity (value))
                    error ("V-031", new Object [] { value });
                sawValue = true;
            }
            if (!sawValue)
                error ("V-040", null);

        } else if (Attribute.CDATA != attr.type())
            throw new InternalError (attr.type());
    }
*/
/*
    private boolean isUnparsedEntity (String name)
    {
        Object e = entities.getNonInterned (name);
        if (e == null || !(e instanceof ExternalEntity))
            return false;
        return ((ExternalEntity)e).notation != null;
    }
*/
    private boolean maybeElementDecl()
            throws IOException, SAXException {

        // [45] elementDecl ::= '<!ELEMENT' S Name S contentspec S? '>'
        // [46] contentspec ::= 'EMPTY' | 'ANY' | Mixed | children
        InputEntity start = peekDeclaration("!ELEMENT");

        if (start == null)
            return false;

        // n.b. for content models where inter-element whitespace is
        // ignorable, we mark that fact here.
        String name = getMarkupDeclname("F-015", true);
//    Element        element = (Element) elements.get (name);
//    boolean        declEffective = false;

/*
    if (element != null) {
        if (element.contentModel() != null) {
            error ("V-012", new Object [] { name });
        } // else <!ATTLIST name ...> came first
    } else {
        element = new Element(name);
        elements.put (element.name(), element);
        declEffective = true;
    }
*/
        if (declaredElements.contains(name))
            error("V-012", new Object[]{name});
        else {
            declaredElements.add(name);
//        declEffective = true;
        }

        short modelType;
        whitespace("F-000");
        if (peek(strEMPTY)) {
///        // leave element.contentModel as null for this case.
            dtdHandler.startContentModel(name, modelType = DTDEventListener.CONTENT_MODEL_EMPTY);
        } else if (peek(strANY)) {
///        element.setContentModel(new StringModel(StringModelType.ANY));
            dtdHandler.startContentModel(name, modelType = DTDEventListener.CONTENT_MODEL_ANY);
        } else {
            modelType = getMixedOrChildren(name);
        }

        dtdHandler.endContentModel(name, modelType);

        maybeWhitespace();
        char c = getc();
        if (c != '>')
            fatal("P-036", new Object[]{name, new Character(c)});
        if (start != in)
            error("V-013", null);

///        dtdHandler.elementDecl(element);

        return true;
    }

    // We're leaving the content model as a regular expression;
    // it's an efficient natural way to express such things, and
    // libraries often interpret them.  No whitespace in the
    // model we store, though!

    /**
     * returns content model type.
     */
    private short getMixedOrChildren(String elementName/*Element element*/)
            throws IOException, SAXException {

        InputEntity start;

        // [47] children ::= (choice|seq) ('?'|'*'|'+')?
        strTmp = new StringBuffer();

        nextChar('(', "F-028", elementName);
        start = in;
        maybeWhitespace();
        strTmp.append('(');

        short modelType;
        if (peek("#PCDATA")) {
            strTmp.append("#PCDATA");
            dtdHandler.startContentModel(elementName, modelType = DTDEventListener.CONTENT_MODEL_MIXED);
            getMixed(elementName, start);
        } else {
            dtdHandler.startContentModel(elementName, modelType = DTDEventListener.CONTENT_MODEL_CHILDREN);
            getcps(elementName, start);
        }

        return modelType;
    }

    // '(' S? already consumed
    // matching ')' must be in "start" entity if validating
    private void getcps(/*Element element,*/String elementName, InputEntity start)
            throws IOException, SAXException {

        // [48] cp ::= (Name|choice|seq) ('?'|'*'|'+')?
        // [49] choice ::= '(' S? cp (S? '|' S? cp)* S? ')'
        // [50] seq    ::= '(' S? cp (S? ',' S? cp)* S? ')'
        boolean decided = false;
        char type = 0;
//        ContentModel       retval, temp, current;

//        retval = temp = current = null;

        dtdHandler.startModelGroup();

        do {
            String tag;

            tag = maybeGetName();
            if (tag != null) {
                strTmp.append(tag);
//                temp = new ElementModel(tag);
//                getFrequency((RepeatableContent)temp);
///->
                dtdHandler.childElement(tag, getFrequency());
///<-
            } else if (peek("(")) {
                InputEntity next = in;
                strTmp.append('(');
                maybeWhitespace();
//                temp = getcps(element, next);
//                getFrequency(temp);
///->
                getcps(elementName, next);
///                getFrequency();        <- this looks like a bug
///<-
            } else
                fatal((type == 0) ? "P-039" :
                        ((type == ',') ? "P-037" : "P-038"),
                        new Object[]{new Character(getc())});

            maybeWhitespace();
            if (decided) {
                char c = getc();

//                if (current != null) {
//                    current.addChild(temp);
//                }
                if (c == type) {
                    strTmp.append(type);
                    maybeWhitespace();
                    reportConnector(type);
                    continue;
                } else if (c == '\u0029') {    // rparen
                    ungetc();
                    continue;
                } else {
                    fatal((type == 0) ? "P-041" : "P-040",
                            new Object[]{
                                new Character(c),
                                new Character(type)
                            });
                }
            } else {
                type = getc();
                switch (type) {
                case '|':
                case ',':
                    reportConnector(type);
                    break;
                default:
//                        retval = temp;
                    ungetc();
                    continue;
                }
//                retval = (ContentModel)current;
                decided = true;
//                current.addChild(temp);
                strTmp.append(type);
            }
            maybeWhitespace();
        } while (!peek(")"));

        if (in != start)
            error("V-014", new Object[]{elementName});
        strTmp.append(')');

        dtdHandler.endModelGroup(getFrequency());
//        return retval;
    }

    private void reportConnector(char type) throws SAXException {
        switch (type) {
        case '|':
            dtdHandler.connector(DTDEventListener.CHOICE);    ///<-
            return;
        case ',':
            dtdHandler.connector(DTDEventListener.SEQUENCE); ///<-
            return;
        default:
            throw new Error();    //assertion failed.
        }
    }

    private short getFrequency()
            throws IOException, SAXException {

        final char c = getc();

        if (c == '?') {
            strTmp.append(c);
            return DTDEventListener.OCCURENCE_ZERO_OR_ONE;
            //        original.setRepeat(Repeat.ZERO_OR_ONE);
        } else if (c == '+') {
            strTmp.append(c);
            return DTDEventListener.OCCURENCE_ONE_OR_MORE;
            //        original.setRepeat(Repeat.ONE_OR_MORE);
        } else if (c == '*') {
            strTmp.append(c);
            return DTDEventListener.OCCURENCE_ZERO_OR_MORE;
            //        original.setRepeat(Repeat.ZERO_OR_MORE);
        } else {
            ungetc();
            return DTDEventListener.OCCURENCE_ONCE;
        }
    }

    // '(' S? '#PCDATA' already consumed
    // matching ')' must be in "start" entity if validating
    private void getMixed(String elementName, /*Element element,*/ InputEntity start)
            throws IOException, SAXException {

        // [51] Mixed ::= '(' S? '#PCDATA' (S? '|' S? Name)* S? ')*'
        //        | '(' S? '#PCDATA'                   S? ')'
        maybeWhitespace();
        if (peek("\u0029*") || peek("\u0029")) {
            if (in != start)
                error("V-014", new Object[]{elementName});
            strTmp.append(')');
//            element.setContentModel(new StringModel(StringModelType.PCDATA));
            return;
        }

        ArrayList l = new ArrayList();
//    l.add(new StringModel(StringModelType.PCDATA));


        while (peek("|")) {
            String name;

            strTmp.append('|');
            maybeWhitespace();

            doLexicalPE = true;
            name = maybeGetName();
            if (name == null)
                fatal("P-042", new Object[]
                {elementName, Integer.toHexString(getc())});
            if (l.contains(name)) {
                error("V-015", new Object[]{name});
            } else {
                l.add(name);
                dtdHandler.mixedElement(name);
            }
            strTmp.append(name);
            maybeWhitespace();
        }

        if (!peek("\u0029*"))    // right paren
            fatal("P-043", new Object[]
            {elementName, new Character(getc())});
        if (in != start)
            error("V-014", new Object[]{elementName});
        strTmp.append(')');
//        ChoiceModel cm = new ChoiceModel((Collection)l);
//    cm.setRepeat(Repeat.ZERO_OR_MORE);
//       element.setContentModel(cm);
    }

    private boolean maybeAttlistDecl()
            throws IOException, SAXException {

        // [52] AttlistDecl ::= '<!ATTLIST' S Name AttDef* S? '>'
        InputEntity start = peekDeclaration("!ATTLIST");

        if (start == null)
            return false;

        String elementName = getMarkupDeclname("F-016", true);
//    Element    element = (Element) elements.get (name);

//    if (element == null) {
//        // not yet declared -- no problem.
//        element = new Element(name);
//        elements.put(name, element);
//    }

        while (!peek(">")) {

            // [53] AttDef ::= S Name S AttType S DefaultDecl
            // [54] AttType ::= StringType | TokenizedType | EnumeratedType

            // look for global attribute definitions, don't expand for now...
            maybeWhitespace();
            char c = getc();
            if (c == '%') {
                String entityName = maybeGetName();
                if (entityName != null) {
                    nextChar(';', "F-021", entityName);
                    whitespace("F-021");
                    continue;
                } else
                    fatal("P-011");
            }

            ungetc();
            // look for attribute name otherwise
            String attName = maybeGetName();
            if (attName == null) {
                fatal("P-044", new Object[]{new Character(getc())});
            }
            whitespace("F-001");

///        Attribute    a = new Attribute (name);

            String typeName;
            Vector values = null;    // notation/enumeration values

            // Note:  use the type constants from Attribute
            // so that "==" may be used (faster)

            // [55] StringType ::= 'CDATA'
            if (peek(TYPE_CDATA))
///            a.setType(Attribute.CDATA);
                typeName = TYPE_CDATA;

            // [56] TokenizedType ::= 'ID' | 'IDREF' | 'IDREFS'
            //        | 'ENTITY' | 'ENTITIES'
            //        | 'NMTOKEN' | 'NMTOKENS'
            // n.b. if "IDREFS" is there, both "ID" and "IDREF"
            // match peekahead ... so this order matters!
            else if (peek(TYPE_IDREFS))
                typeName = TYPE_IDREFS;
            else if (peek(TYPE_IDREF))
                typeName = TYPE_IDREF;
            else if (peek(TYPE_ID)) {
                typeName = TYPE_ID;
// TODO: should implement this error check?
///        if (element.id() != null) {
///                    error ("V-016", new Object [] { element.id() });
///        } else
///            element.setId(name);
            } else if (peek(TYPE_ENTITY))
                typeName = TYPE_ENTITY;
            else if (peek(TYPE_ENTITIES))
                typeName = TYPE_ENTITIES;
            else if (peek(TYPE_NMTOKENS))
                typeName = TYPE_NMTOKENS;
            else if (peek(TYPE_NMTOKEN))
                typeName = TYPE_NMTOKEN;

            // [57] EnumeratedType ::= NotationType | Enumeration
            // [58] NotationType ::= 'NOTATION' S '(' S? Name
            //        (S? '|' S? Name)* S? ')'
            else if (peek(TYPE_NOTATION)) {
                typeName = TYPE_NOTATION;
                whitespace("F-002");
                nextChar('(', "F-029", null);
                maybeWhitespace();

                values = new Vector();
                do {
                    String name;
                    if ((name = maybeGetName()) == null)
                        fatal("P-068");
                    // permit deferred declarations
                    if (notations.get(name) == null)
                        notations.put(name, name);
                    values.addElement(name);
                    maybeWhitespace();
                    if (peek("|"))
                        maybeWhitespace();
                } while (!peek(")"));
///            a.setValues(new String [v.size ()]);
///            for (int i = 0; i < v.size (); i++)
///                a.setValue(i, (String)v.elementAt(i));

                // [59] Enumeration ::= '(' S? Nmtoken (S? '|' Nmtoken)* S? ')'
            } else if (peek("(")) {
///            a.setType(Attribute.ENUMERATION);
                typeName = TYPE_ENUMERATION;

                maybeWhitespace();

///            Vector v = new Vector ();
                values = new Vector();
                do {
                    String name = getNmtoken();
///                v.addElement (name);
                    values.addElement(name);
                    maybeWhitespace();
                    if (peek("|"))
                        maybeWhitespace();
                } while (!peek(")"));
///            a.setValues(new String [v.size ()]);
///            for (int i = 0; i < v.size (); i++)
///                a.setValue(i, (String)v.elementAt(i));
            } else {
                fatal("P-045",
                        new Object[]{attName, new Character(getc())});
                typeName = null;
            }

            short attributeUse;
            String defaultValue = null;

            // [60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED'
            //        | (('#FIXED' S)? AttValue)
            whitespace("F-003");
            if (peek("#REQUIRED"))
                attributeUse = DTDEventListener.USE_REQUIRED;
///            a.setIsRequired(true);
            else if (peek("#FIXED")) {
///            if (a.type() == Attribute.ID)
                if (typeName == TYPE_ID)
                    error("V-017", new Object[]{attName});
///            a.setIsFixed(true);
                attributeUse = DTDEventListener.USE_FIXED;
                whitespace("F-004");
                parseLiteral(false);
///            if (a.type() != Attribute.CDATA)
///                a.setDefaultValue(normalize(false));
///            else
///                a.setDefaultValue(strTmp.toString());

                if (typeName == TYPE_CDATA)
                    defaultValue = normalize(false);
                else
                    defaultValue = strTmp.toString();

// TODO: implement this check
///            if (a.type() != Attribute.CDATA)
///                validateAttributeSyntax (a, a.defaultValue());
            } else if (!peek("#IMPLIED")) {
                attributeUse = DTDEventListener.USE_IMPLIED;

///            if (a.type() == Attribute.ID)
                if (typeName == TYPE_ID)
                    error("V-018", new Object[]{attName});
                parseLiteral(false);
///            if (a.type() != Attribute.CDATA)
///                a.setDefaultValue(normalize(false));
///            else
///                a.setDefaultValue(strTmp.toString());
                if (typeName == TYPE_CDATA)
                    defaultValue = normalize(false);
                else
                    defaultValue = strTmp.toString();

// TODO: implement this check
///            if (a.type() != Attribute.CDATA)
///                validateAttributeSyntax (a, a.defaultValue());
            } else {
                // TODO: this looks like an fatal error.
                attributeUse = DTDEventListener.USE_NORMAL;
            }

            if (XmlLang.equals(attName)
                    && defaultValue/* a.defaultValue()*/ != null
                    && !isXmlLang(defaultValue/*a.defaultValue()*/))
                error("P-033", new Object[]{defaultValue /*a.defaultValue()*/});

// TODO: isn't it an error to specify the same attribute twice?
///        if (!element.attributes().contains(a)) {
///            element.addAttribute(a);
///            dtdHandler.attributeDecl(a);
///        }

            String[] v = (values != null) ? (String[]) values.toArray(new String[0]) : null;
            dtdHandler.attributeDecl(elementName, attName, typeName, v, attributeUse, defaultValue);
            maybeWhitespace();
        }
        if (start != in)
            error("V-013", null);
        return true;
    }

    // used when parsing literal attribute values,
    // or public identifiers.
    //
    // input in strTmp
    private String normalize(boolean invalidIfNeeded) {

        // this can allocate an extra string...

        String s = strTmp.toString();
        String s2 = s.trim();
        boolean didStrip = false;

        if (s != s2) {
            s = s2;
            s2 = null;
            didStrip = true;
        }
        strTmp = new StringBuffer();
        for (int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            if (!XmlChars.isSpace(c)) {
                strTmp.append(c);
                continue;
            }
            strTmp.append(' ');
            while (++i < s.length() && XmlChars.isSpace(s.charAt(i)))
                didStrip = true;
            i--;
        }
        if (didStrip)
            return strTmp.toString();
        else
            return s;
    }

    private boolean maybeConditionalSect()
            throws IOException, SAXException {

        // [61] conditionalSect ::= includeSect | ignoreSect

        if (!peek("<!["))
            return false;

        String keyword;
        InputEntity start = in;

        maybeWhitespace();

        if ((keyword = maybeGetName()) == null)
            fatal("P-046");
        maybeWhitespace();
        nextChar('[', "F-030", null);

        // [62] includeSect ::= '<![' S? 'INCLUDE' S? '['
        //                extSubsetDecl ']]>'
        if ("INCLUDE".equals(keyword)) {
            for (; ;) {
                while (in.isEOF() && in != start)
                    in = in.pop();
                if (in.isEOF()) {
                    error("V-020", null);
                }
                if (peek("]]>"))
                    break;

                doLexicalPE = false;
                if (maybeWhitespace())
                    continue;
                if (maybePEReference())
                    continue;
                doLexicalPE = true;
                if (maybeMarkupDecl() || maybeConditionalSect())
                    continue;

                fatal("P-047");
            }

            // [63] ignoreSect ::= '<![' S? 'IGNORE' S? '['
            //            ignoreSectcontents ']]>'
            // [64] ignoreSectcontents ::= Ignore ('<!['
            //            ignoreSectcontents ']]>' Ignore)*
            // [65] Ignore ::= Char* - (Char* ('<![' | ']]>') Char*)
        } else if ("IGNORE".equals(keyword)) {
            int nestlevel = 1;
            // ignoreSectcontents
            doLexicalPE = false;
            while (nestlevel > 0) {
                char c = getc();    // will pop input entities
                if (c == '<') {
                    if (peek("!["))
                        nestlevel++;
                } else if (c == ']') {
                    if (peek("]>"))
                        nestlevel--;
                } else
                    continue;
            }
        } else
            fatal("P-048", new Object[]{keyword});
        return true;
    }


    //
    // CHAPTER 4:  Physical Structures
    //

    // parse decimal or hex numeric character reference
    private int parseCharNumber()
            throws IOException, SAXException {

        char c;
        int retval = 0;

        // n.b. we ignore overflow ...
        if (getc() != 'x') {
            ungetc();
            for (; ;) {
                c = getc();
                if (c >= '0' && c <= '9') {
                    retval *= 10;
                    retval += (c - '0');
                    continue;
                }
                if (c == ';')
                    return retval;
                fatal("P-049");
            }
        } else
            for (; ;) {
                c = getc();
                if (c >= '0' && c <= '9') {
                    retval <<= 4;
                    retval += (c - '0');
                    continue;
                }
                if (c >= 'a' && c <= 'f') {
                    retval <<= 4;
                    retval += 10 + (c - 'a');
                    continue;
                }
                if (c >= 'A' && c <= 'F') {
                    retval <<= 4;
                    retval += 10 + (c - 'A');
                    continue;
                }
                if (c == ';')
                    return retval;
                fatal("P-050");
            }
    }

    // parameter is a UCS-4 character ... i.e. not just 16 bit UNICODE,
    // though still subject to the 'Char' construct in XML
    private int surrogatesToCharTmp(int ucs4)
            throws SAXException {

        if (ucs4 <= 0xffff) {
            if (XmlChars.isChar(ucs4)) {
                charTmp[0] = (char) ucs4;
                return 1;
            }
        } else if (ucs4 <= 0x0010ffff) {
            // we represent these as UNICODE surrogate pairs
            ucs4 -= 0x10000;
            charTmp[0] = (char) (0xd800 | ((ucs4 >> 10) & 0x03ff));
            charTmp[1] = (char) (0xdc00 | (ucs4 & 0x03ff));
            return 2;
        }
        fatal("P-051", new Object[]{Integer.toHexString(ucs4)});
        // NOTREACHED
        return -1;
    }

    private boolean maybePEReference()
            throws IOException, SAXException {

        // This is the SYNTACTIC version of this construct.
        // When processing external entities, there is also
        // a LEXICAL version; see getc() and doLexicalPE.

        // [69] PEReference ::= '%' Name ';'
        if (!in.peekc('%'))
            return false;

        String name = maybeGetName();
        Object entity;

        if (name == null)
            fatal("P-011");
        nextChar(';', "F-021", name);
        entity = params.get(name);

        if (entity instanceof InternalEntity) {
            InternalEntity value = (InternalEntity) entity;
            pushReader(value.buf, name, false);

        } else if (entity instanceof ExternalEntity) {
            pushReader((ExternalEntity) entity);
            externalParameterEntity((ExternalEntity) entity);

        } else if (entity == null) {
            error("V-022", new Object[]{name});
        }
        return true;
    }

    private boolean maybeEntityDecl()
            throws IOException, SAXException {

        // [70] EntityDecl ::= GEDecl | PEDecl
        // [71] GEDecl ::= '<!ENTITY' S       Name S EntityDef S? '>'
        // [72] PEDecl ::= '<!ENTITY' S '%' S Name S PEDEF     S? '>'
        // [73] EntityDef ::= EntityValue | (ExternalID NDataDecl?)
        // [74] PEDef     ::= EntityValue |  ExternalID
        //
        InputEntity start = peekDeclaration("!ENTITY");

        if (start == null)
            return false;

        String entityName;
        SimpleHashtable defns;
        ExternalEntity externalId;
        boolean doStore;

        // PE expansion gets selectively turned off several places:
        // in ENTITY declarations (here), in comments, in PIs.

        // Here, we allow PE entities to be declared, and allows
        // literals to include PE refs without the added spaces
        // required with their expansion in markup decls.

        doLexicalPE = false;
        whitespace("F-005");
        if (in.peekc('%')) {
            whitespace("F-006");
            defns = params;
        } else
            defns = entities;

        ungetc();    // leave some whitespace
        doLexicalPE = true;
        entityName = getMarkupDeclname("F-017", false);
        whitespace("F-007");
        externalId = maybeExternalID();

        //
        // first definition sticks ... e.g. internal subset PEs are used
        // to override DTD defaults.  It's also an "error" to incorrectly
        // redefine builtin internal entities, but since reporting such
        // errors is optional we only give warnings ("just in case") for
        // non-parameter entities.
        //
        doStore = (defns.get(entityName) == null);
        if (!doStore && defns == entities)
            warning("P-054", new Object[]{entityName});

        // internal entities
        if (externalId == null) {
            char value [];
            InternalEntity entity;

            doLexicalPE = false;        // "ab%bar;cd" -maybe-> "abcd"
            parseLiteral(true);
            doLexicalPE = true;
            if (doStore) {
                value = new char[strTmp.length()];
                if (value.length != 0)
                    strTmp.getChars(0, value.length, value, 0);
                entity = new InternalEntity(entityName, value);
                entity.isPE = (defns == params);
                entity.isFromInternalSubset = false;
                defns.put(entityName, entity);
                if (defns == entities)
                    dtdHandler.internalGeneralEntityDecl(entityName,
                            new String(value));
            }

            // external entities (including unparsed)
        } else {
            // [76] NDataDecl ::= S 'NDATA' S Name
            if (defns == entities && maybeWhitespace()
                    && peek("NDATA")) {
                externalId.notation = getMarkupDeclname("F-018", false);

                // flag undeclared notation for checking after
                // the DTD is fully processed
                if (notations.get(externalId.notation) == null)
                    notations.put(externalId.notation, Boolean.TRUE);
            }
            externalId.name = entityName;
            externalId.isPE = (defns == params);
            externalId.isFromInternalSubset = false;
            if (doStore) {
                defns.put(entityName, externalId);
                if (externalId.notation != null)
                    dtdHandler.unparsedEntityDecl(entityName,
                            externalId.publicId, externalId.systemId,
                            externalId.notation);
                else if (defns == entities)
                    dtdHandler.externalGeneralEntityDecl(entityName,
                            externalId.publicId, externalId.systemId);
            }
        }
        maybeWhitespace();
        nextChar('>', "F-031", entityName);
        if (start != in)
            error("V-013", null);
        return true;
    }

    private ExternalEntity maybeExternalID()
            throws IOException, SAXException {

        // [75] ExternalID ::= 'SYSTEM' S SystemLiteral
        //        | 'PUBLIC' S' PubidLiteral S Systemliteral
        String temp = null;
        ExternalEntity retval;

        if (peek("PUBLIC")) {
            whitespace("F-009");
            temp = parsePublicId();
        } else if (!peek("SYSTEM"))
            return null;

        retval = new ExternalEntity(in);
        retval.publicId = temp;
        whitespace("F-008");
        retval.systemId = parseSystemId();
        return retval;
    }

    private String parseSystemId()
            throws IOException, SAXException {

        String uri = getQuotedString("F-034", null);
        int temp = uri.indexOf(':');

        // resolve relative URIs ... must do it here since
        // it's relative to the source file holding the URI!

        // "new java.net.URL (URL, string)" conforms to RFC 1630,
        // but we can't use that except when the URI is a URL.
        // The entity resolver is allowed to handle URIs that are
        // not URLs, so we pass URIs through with scheme intact
        if (temp == -1 || uri.indexOf('/') < temp) {
            String baseURI;

            baseURI = in.getSystemId();
            if (baseURI == null)
                fatal("P-055", new Object[]{uri});
            if (uri.length() == 0)
                uri = ".";
            baseURI = baseURI.substring(0, baseURI.lastIndexOf('/') + 1);
            if (uri.charAt(0) != '/')
                uri = baseURI + uri;
            else {
                // XXX slashes at the beginning of a relative URI are
                // a special case we don't handle.
                throw new InternalError();
            }

            // letting other code map any "/xxx/../" or "/./" to "/",
            // since all URIs must handle it the same.
        }
        // check for fragment ID in URI
        if (uri.indexOf('#') != -1)
            error("P-056", new Object[]{uri});
        return uri;
    }

    private void maybeTextDecl()
            throws IOException, SAXException {

        // [77] TextDecl ::= '<?xml' VersionInfo? EncodingDecl S? '?>'
        if (peek("<?xml")) {
            readVersion(false, "1.0");
            readEncoding(true);
            maybeWhitespace();
            if (!peek("?>"))
                fatal("P-057");
        }
    }

    private void externalParameterEntity(ExternalEntity next)
            throws IOException, SAXException {

        //
        // Reap the intended benefits of standalone declarations:
        // don't deal with external parameter entities, except to
        // validate the standalone declaration.
        //

        // n.b. "in external parameter entities" (and external
        // DTD subset, same grammar) parameter references can
        // occur "within" markup declarations ... expansions can
        // cross syntax rules.  Flagged here; affects getc().

        // [79] ExtPE ::= TextDecl? extSubsetDecl
        // [31] extSubsetDecl ::= ( markupdecl | conditionalSect
        //        | PEReference | S )*
        InputEntity pe;

        // XXX if this returns false ...

        pe = in;
        maybeTextDecl();
        while (!pe.isEOF()) {
            // pop internal PEs (and whitespace before/after)
            if (in.isEOF()) {
                in = in.pop();
                continue;
            }
            doLexicalPE = false;
            if (maybeWhitespace())
                continue;
            if (maybePEReference())
                continue;
            doLexicalPE = true;
            if (maybeMarkupDecl() || maybeConditionalSect())
                continue;
            break;
        }
        // if (in != pe) throw new InternalError("who popped my PE?");
        if (!pe.isEOF())
            fatal("P-059", new Object[]{in.getName()});
    }

    private void readEncoding(boolean must)
            throws IOException, SAXException {

        // [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
        String name = maybeReadAttribute("encoding", must);

        if (name == null)
            return;
        for (int i = 0; i < name.length(); i++) {
            char c = name.charAt(i);
            if ((c >= 'A' && c <= 'Z')
                    || (c >= 'a' && c <= 'z'))
                continue;
            if (i != 0
                    && ((c >= '0' && c <= '9')
                    || c == '-'
                    || c == '_'
                    || c == '.'
                    ))
                continue;
            fatal("P-060", new Object[]{new Character(c)});
        }

        //
        // This should be the encoding in use, and it's even an error for
        // it to be anything else (in certain cases that are impractical to
        // to test, and may even be insufficient).  So, we do the best we
        // can, and warn if things look suspicious.  Note that Java doesn't
        // uniformly expose the encodings, and that the names it uses
        // internally are nonstandard.  Also, that the XML spec allows
        // such "errors" not to be reported at all.
        //
        String currentEncoding = in.getEncoding();

        if (currentEncoding != null
                && !name.equalsIgnoreCase(currentEncoding))
            warning("P-061", new Object[]{name, currentEncoding});
    }

    private boolean maybeNotationDecl()
            throws IOException, SAXException {

        // [82] NotationDecl ::= '<!NOTATION' S Name S
        //        (ExternalID | PublicID) S? '>'
        // [83] PublicID ::= 'PUBLIC' S PubidLiteral
        InputEntity start = peekDeclaration("!NOTATION");

        if (start == null)
            return false;

        String name = getMarkupDeclname("F-019", false);
        ExternalEntity entity = new ExternalEntity(in);

        whitespace("F-011");
        if (peek("PUBLIC")) {
            whitespace("F-009");
            entity.publicId = parsePublicId();
            if (maybeWhitespace()) {
                if (!peek(">"))
                    entity.systemId = parseSystemId();
                else
                    ungetc();
            }
        } else if (peek("SYSTEM")) {
            whitespace("F-008");
            entity.systemId = parseSystemId();
        } else
            fatal("P-062");
        maybeWhitespace();
        nextChar('>', "F-032", name);
        if (start != in)
            error("V-013", null);
        if (entity.systemId != null && entity.systemId.indexOf('#') != -1)
            error("P-056", new Object[]{entity.systemId});

        Object value = notations.get(name);
        if (value != null && value instanceof ExternalEntity)
            warning("P-063", new Object[]{name});

        else {
            notations.put(name, entity);
            dtdHandler.notationDecl(name, entity.publicId,
                    entity.systemId);
        }
        return true;
    }


    ////////////////////////////////////////////////////////////////
    //
    //    UTILITIES
    //
    ////////////////////////////////////////////////////////////////

    private char getc() throws IOException, SAXException {

        if (!doLexicalPE) {
            char c = in.getc();
            return c;
        }

        //
        // External parameter entities get funky processing of '%param;'
        // references.  It's not clearly defined in the XML spec; but it
        // boils down to having those refs be _lexical_ in most cases to
        // include partial syntax productions.  It also needs selective
        // enabling; "<!ENTITY % foo ...>" must work, for example, and
        // if "bar" is an empty string PE, "ab%bar;cd" becomes "abcd"
        // if it's expanded in a literal, else "ab  cd".  PEs also do
        // not expand within comments or PIs, and external PEs are only
        // allowed to have markup decls (and so aren't handled lexically).
        //
        // This PE handling should be merged into maybeWhitespace, where
        // it can be dealt with more consistently.
        //
        // Also, there are some validity constraints in this area.
        //
        char c;

        while (in.isEOF()) {
            if (in.isInternal() || (doLexicalPE && !in.isDocument()))
                in = in.pop();
            else {
                fatal("P-064", new Object[]{in.getName()});
            }
        }
        if ((c = in.getc()) == '%' && doLexicalPE) {
            // PE ref ::= '%' name ';'
            String name = maybeGetName();
            Object entity;

            if (name == null)
                fatal("P-011");
            nextChar(';', "F-021", name);
            entity = params.get(name);

            // push a magic "entity" before and after the
            // real one, so ungetc() behaves uniformly
            pushReader(" ".toCharArray(), null, false);
            if (entity instanceof InternalEntity)
                pushReader(((InternalEntity) entity).buf, name, false);
            else if (entity instanceof ExternalEntity)
            // PEs can't be unparsed!
            // XXX if this returns false ...
                pushReader((ExternalEntity) entity);
            else if (entity == null)
            // see note in maybePEReference re making this be nonfatal.
                fatal("V-022");
            else
                throw new InternalError();
            pushReader(" ".toCharArray(), null, false);
            return in.getc();
        }
        return c;
    }

    private void ungetc() {

        in.ungetc();
    }

    private boolean peek(String s)
            throws IOException, SAXException {

        return in.peek(s, null);
    }

    // Return the entity starting the specified declaration
    // (for validating declaration nesting) else null.

    private InputEntity peekDeclaration(String s)
            throws IOException, SAXException {

        InputEntity start;

        if (!in.peekc('<'))
            return null;
        start = in;
        if (in.peek(s, null))
            return start;
        in.ungetc();
        return null;
    }

    private void nextChar(char c, String location, String near)
            throws IOException, SAXException {

        while (in.isEOF() && !in.isDocument())
            in = in.pop();
        if (!in.peekc(c))
            fatal("P-008", new Object[]
            {new Character(c),
             messages.getMessage(locale, location),
             (near == null ? "" : ('"' + near + '"'))});
    }


    private void pushReader(char buf [], String name, boolean isGeneral)
            throws SAXException {

        InputEntity r = InputEntity.getInputEntity(dtdHandler, locale);
        r.init(buf, name, in, !isGeneral);
        in = r;
    }

    private boolean pushReader(ExternalEntity next)
            throws IOException, SAXException {

        InputEntity r = InputEntity.getInputEntity(dtdHandler, locale);
        InputSource s;
        try {
            s = next.getInputSource(resolver);
        } catch (IOException e) {
            String msg =
                    "unable to open the external entity from :" + next.systemId;
            if (next.publicId != null)
                msg += " (public id:" + next.publicId + ")";

            SAXParseException spe = new SAXParseException(msg,
                    getPublicId(), getSystemId(), getLineNumber(), getColumnNumber(), e);
            dtdHandler.fatalError(spe);
            throw e;
        }

        r.init(s, next.name, in, next.isPE);
        in = r;
        return true;
    }

    public String getPublicId() {

        return (in == null) ? null : in.getPublicId();
    }

    public String getSystemId() {

        return (in == null) ? null : in.getSystemId();
    }

    public int getLineNumber() {

        return (in == null) ? -1 : in.getLineNumber();
    }

    public int getColumnNumber() {

        return (in == null) ? -1 : in.getColumnNumber();
    }

    // error handling convenience routines

    private void warning(String messageId, Object parameters [])
            throws SAXException {

        SAXParseException e = new SAXParseException(messages.getMessage(locale, messageId, parameters),
                getPublicId(), getSystemId(), getLineNumber(), getColumnNumber());

        dtdHandler.warning(e);
    }

    void error(String messageId, Object parameters [])
            throws SAXException {

        SAXParseException e = new SAXParseException(messages.getMessage(locale, messageId, parameters),
                getPublicId(), getSystemId(), getLineNumber(), getColumnNumber());

        dtdHandler.error(e);
    }

    private void fatal(String messageId) throws SAXException {

        fatal(messageId, null);
    }

    private void fatal(String messageId, Object parameters [])
            throws SAXException {

        SAXParseException e = new SAXParseException(messages.getMessage(locale, messageId, parameters),
                getPublicId(), getSystemId(), getLineNumber(), getColumnNumber());

        dtdHandler.fatalError(e);

        throw e;
    }

    //
    // Map char arrays to strings ... cuts down both on memory and
    // CPU usage for element/attribute/other names that are reused.
    //
    // Documents typically repeat names a lot, so we more or less
    // intern all the strings within the document; since some strings
    // are repeated in multiple documents (e.g. stylesheets) we go
    // a bit further, and intern globally.
    //
    static class NameCache {
        //
        // Unless we auto-grow this, the default size should be a
        // reasonable bit larger than needed for most XML files
        // we've yet seen (and be prime).  If it's too small, the
        // penalty is just excess cache collisions.
        //
        NameCacheEntry hashtable [] = new NameCacheEntry[541];

        //
        // Usually we just want to get the 'symbol' for these chars
        //
        String lookup(char value [], int len) {

            return lookupEntry(value, len).name;
        }

        //
        // Sometimes we need to scan the chars in the resulting
        // string, so there's an accessor which exposes them.
        // (Mostly for element end tags.)
        //
        NameCacheEntry lookupEntry(char value [], int len) {

            int index = 0;
            NameCacheEntry entry;

            // hashing to get index
            for (int i = 0; i < len; i++)
                index = index * 31 + value[i];
            index &= 0x7fffffff;
            index %= hashtable.length;

            // return entry if one's there ...
            for (entry = hashtable[index];
                 entry != null;
                 entry = entry.next) {
                if (entry.matches(value, len))
                    return entry;
            }

            // else create new one
            entry = new NameCacheEntry();
            entry.chars = new char[len];
            System.arraycopy(value, 0, entry.chars, 0, len);
            entry.name = new String(entry.chars);
            //
            // NOTE:  JDK 1.1 has a fixed size string intern table,
            // with non-GC'd entries.  It can panic here; that's a
            // JDK problem, use 1.2 or later with many identifiers.
            //
            entry.name = entry.name.intern();        // "global" intern
            entry.next = hashtable[index];
            hashtable[index] = entry;
            return entry;
        }
    }

    static class NameCacheEntry {

        String name;
        char chars [];
        NameCacheEntry next;

        boolean matches(char value [], int len) {

            if (chars.length != len)
                return false;
            for (int i = 0; i < len; i++)
                if (value[i] != chars[i])
                    return false;
            return true;
        }
    }

    //
    // Message catalog for diagnostics.
    //
    static final Catalog messages = new Catalog();

    static final class Catalog extends MessageCatalog {

        Catalog() {
            super(DTDParser.class);
        }
    }

}