internal/dtdparser/Resolver.java

	Resolver.java revision 325
0N/A/*
29N/A * Copyright (c) 1998, 2011, Oracle and/or its affiliates. All rights reserved.
0N/A * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
0N/A *
0N/A * This code is free software; you can redistribute it and/or modify it
0N/A * under the terms of the GNU General Public License version 2 only, as
0N/A * published by the Free Software Foundation.  Oracle designates this
0N/A * particular file as subject to the "Classpath" exception as provided
0N/A * by Oracle in the LICENSE file that accompanied this code.
0N/A *
0N/A * This code is distributed in the hope that it will be useful, but WITHOUT
0N/A * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
0N/A * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
0N/A * version 2 for more details (a copy is included in the LICENSE file that
0N/A * accompanied this code).
0N/A *
0N/A * You should have received a copy of the GNU General Public License version
0N/A * 2 along with this work; if not, write to the Free Software Foundation,
0N/A * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
0N/A *
0N/A * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
0N/A * or visit www.oracle.com if you need additional information or have any
0N/A * questions.
0N/A */
0N/A
0N/Apackage com.sun.xml.internal.dtdparser;
0N/A
0N/Aimport org.xml.sax.EntityResolver;
0N/Aimport org.xml.sax.InputSource;
0N/A
0N/Aimport java.io.File;
0N/Aimport java.io.FileInputStream;
0N/Aimport java.io.IOException;
0N/Aimport java.io.InputStream;
0N/Aimport java.net.URL;
0N/Aimport java.net.URLConnection;
0N/Aimport java.util.Hashtable;
0N/A
0N/A/**
0N/A * This entity resolver class provides a number of utilities which can help
0N/A * managment of external parsed entities in XML.  These are commonly used
29N/A * to hold markup declarations that are to be used as part of a Document
0N/A * Type Declaration (DTD), or to hold text marked up with XML.
0N/A * <p/>
0N/A * <P> Features include: <UL>
0N/A * <p/>
0N/A * <LI> Static factory methods are provided for constructing SAX InputSource
0N/A * objects from Files, URLs, or MIME objects.  This eliminates a class of
0N/A * error-prone coding in applications.
0N/A * <p/>
0N/A * <LI> Character encodings for XML documents are correctly supported: <UL>
0N/A * <p/>
0N/A * <LI> The encodings defined in the RFCs for MIME content types
0N/A * (2046 for general MIME, and 2376 for XML in particular), are
0N/A * supported, handling <em>charset=...</em> attributes and accepting
0N/A * content types which are known to be safe for use with XML;
0N/A * <p/>
0N/A * <LI> The character encoding autodetection algorithm identified
0N/A * in the XML specification is used, and leverages all of
0N/A * the JDK 1.1 (and later) character encoding support.
0N/A * <p/>
0N/A * <LI> The use of MIME typing may optionally be disabled, forcing the
0N/A * use of autodetection, to support web servers which don't correctly
0N/A * report MIME types for XML.  For example, they may report text that
0N/A * is encoded in EUC-JP as being US-ASCII text, leading to fatal
0N/A * errors during parsing.
0N/A * <p/>
0N/A * <LI> The InputSource objects returned by this class always
0N/A * have a <code>java.io.Reader</code> available as the "character
0N/A * stream" property.
0N/A * <p/>
0N/A * </UL>
0N/A * <p/>
29N/A * <LI> Catalog entries can map public identifiers to Java resources or
0N/A * to local URLs.  These are used to reduce network dependencies and loads,
0N/A * and will often be used for external DTD components.  For example, packages
0N/A * shipping DTD files as resources in JAR files can eliminate network traffic
0N/A * when accessing them, and sites may provide local caches of common DTDs.
0N/A * Note that no particular catalog syntax is supported by this class, only
0N/A * the notion of a set of entries.
29N/A * <p/>
0N/A * </UL>
671N/A * <p/>
0N/A * <P> Subclasses can perform tasks such as supporting new URI schemes for
0N/A * URIs which are not URLs, such as URNs (see RFC 2396) or for accessing
0N/A * MIME entities which are part of a <em>multipart/related</em> group
0N/A * (see RFC 2387).  They may also be used to support particular catalog
0N/A * syntaxes, such as the <a href="http://www.oasis-open.org/html/a401.htm">
0N/A * SGML/Open Catalog (SOCAT)</a> which supports the SGML notion of "Formal
0N/A * Public Identifiers (FPIs).
0N/A *
0N/A * @author David Brownell
0N/A * @author Janet Koenig
0N/A * @version 1.3 00/02/24
0N/A */
0N/Apublic class Resolver implements EntityResolver {
0N/A    private boolean ignoringMIME;
29N/A
29N/A    // table mapping public IDs to (local) URIs
29N/A    private Hashtable id2uri;
29N/A
672N/A    // tables mapping public IDs to resources and classloaders
29N/A    private Hashtable id2resource;
29N/A    private Hashtable id2loader;
29N/A
29N/A    //
29N/A    // table of MIME content types (less attributes!) known
0N/A    // to be mostly "OK" to use with XML MIME entities.  the
0N/A    // idea is to rule out obvious braindamage ("image/jpg")
0N/A    // not the subtle stuff ("text/html") that might actually
0N/A    // be (or become) safe.
0N/A    //
0N/A    private static final String types [] = {
29N/A        "application/xml",
0N/A        "text/xml",
0N/A        "text/plain",
0N/A        "text/html", // commonly mis-inferred
0N/A        "application/x-netcdf", // this is often illegal XML
671N/A        "content/unknown"
671N/A    };
0N/A
0N/A    /**
0N/A     * Constructs a resolver.
0N/A     */
0N/A    public Resolver() {
0N/A    }
0N/A
0N/A    /**
0N/A     * Returns an input source, using the MIME type information and URL
0N/A     * scheme to statically determine the correct character encoding if
0N/A     * possible and otherwise autodetecting it.  MIME carefully specifies
0N/A     * the character encoding defaults, and how attributes of the content
0N/A     * type can change it.  XML further specifies two mandatory encodings
0N/A     * (UTF-8 and UTF-16), and includes an XML declaration which can be
0N/A     * used to internally label most documents encoded using US-ASCII
0N/A     * supersets (such as Shift_JIS, EUC-JP, ISO-2022-*, ISO-8859-*, and
0N/A     * more).
0N/A     * <p/>
0N/A     * <P> This method can be used to access XML documents which do not
0N/A     * have URIs (such as servlet input streams, or most JavaMail message
0N/A     * entities) and to support access methods such as HTTP POST or PUT.
0N/A     * (URLs normally return content using the GET method.)
0N/A     * <p/>
29N/A     * <P> <em> The caller should set the system ID in order for relative URIs
29N/A     * found in this document to be interpreted correctly.</em> In some cases,
29N/A     * a custom resolver will need to be used; for example, documents
29N/A     * may be grouped in a single MIME "multipart/related" bundle, and
29N/A     * relative URLs would refer to other documents in that bundle.
29N/A     *
29N/A     * @param contentType The MIME content type for the source for which
29N/A     *                    an InputSource is desired, such as <em>text/xml;charset=utf-8</em>.
29N/A     * @param stream      The input byte stream for the input source.
29N/A     * @param checkType   If true, this verifies that the content type is known
29N/A     *                    to support XML documents, such as <em>application/xml</em>.
29N/A     * @param scheme      Unless this is "file", unspecified MIME types
29N/A     *                    default to US-ASCII.  Files are always autodetected since most
29N/A     *                    file systems discard character encoding information.
29N/A     */
0N/A    public static InputSource createInputSource(String contentType,
0N/A                                                InputStream stream,
0N/A                                                boolean checkType,
0N/A                                                String scheme) throws IOException {
0N/A        InputSource retval;
0N/A        String charset = null;
0N/A
0N/A        if (contentType != null) {
0N/A            int index;
0N/A
0N/A            contentType = contentType.toLowerCase();
0N/A            index = contentType.indexOf(';');
0N/A            if (index != -1) {
0N/A                String attributes;
0N/A
0N/A                attributes = contentType.substring(index + 1);
0N/A                contentType = contentType.substring(0, index);
0N/A
29N/A                // use "charset=..." if it's available
0N/A                index = attributes.indexOf("charset");
0N/A                if (index != -1) {
0N/A                    attributes = attributes.substring(index + 7);
0N/A                    // strip out subsequent attributes
0N/A                    if ((index = attributes.indexOf(';')) != -1)
0N/A                        attributes = attributes.substring(0, index);
0N/A                    // find start of value
29N/A                    if ((index = attributes.indexOf('=')) != -1) {
0N/A                        attributes = attributes.substring(index + 1);
29N/A                        // strip out rfc822 comments
0N/A                        if ((index = attributes.indexOf('(')) != -1)
0N/A                            attributes = attributes.substring(0, index);
0N/A                        // double quotes are optional
29N/A                        if ((index = attributes.indexOf('"')) != -1) {
0N/A                            attributes = attributes.substring(index + 1);
29N/A                            attributes = attributes.substring(0,
0N/A                                    attributes.indexOf('"'));
29N/A                        }
0N/A                        charset = attributes.trim();
0N/A                        // XXX "\;", "\)" etc were mishandled above
0N/A                    }
0N/A                }
0N/A            }
29N/A
0N/A            //
0N/A            // Check MIME type.
0N/A            //
0N/A            if (checkType) {
0N/A                boolean isOK = false;
0N/A                for (int i = 0; i < types.length; i++)
0N/A                    if (types[i].equals(contentType)) {
0N/A                        isOK = true;
0N/A                        break;
0N/A                    }
0N/A                if (!isOK)
0N/A                    throw new IOException("Not XML: " + contentType);
0N/A            }
0N/A
0N/A            //
0N/A            // "text/*" MIME types have hard-wired character set
0N/A            // defaults, as specified in the RFCs.  For XML, we
0N/A            // ignore the system "file.encoding" property since
0N/A            // autodetection is more correct.
0N/A            //
0N/A            if (charset == null) {
0N/A                contentType = contentType.trim();
0N/A                if (contentType.startsWith("text/")) {
0N/A                    if (!"file".equalsIgnoreCase(scheme))
0N/A                        charset = "US-ASCII";
0N/A                }
0N/A                // "application/*" has no default
0N/A            }
29N/A        }
0N/A
0N/A        retval = new InputSource(XmlReader.createReader(stream, charset));
0N/A        retval.setByteStream(stream);
0N/A        retval.setEncoding(charset);
29N/A        return retval;
0N/A    }
29N/A
0N/A
29N/A    /**
0N/A     * Creates an input source from a given URI.
29N/A     *
0N/A     * @param uri       the URI (system ID) for the entity
0N/A     * @param checkType if true, the MIME content type for the entity
29N/A     *                  is checked for document type and character set encoding.
0N/A     */
0N/A    static public InputSource createInputSource(URL uri, boolean checkType)
0N/A            throws IOException {
0N/A
0N/A        URLConnection conn = uri.openConnection();
0N/A        InputSource retval;
0N/A
0N/A        if (checkType) {
0N/A            String contentType = conn.getContentType();
0N/A            retval = createInputSource(contentType, conn.getInputStream(),
29N/A                    false, uri.getProtocol());
0N/A        } else {
29N/A            retval = new InputSource(XmlReader.createReader(conn.getInputStream()));
0N/A        }
29N/A        retval.setSystemId(conn.getURL().toString());
0N/A        return retval;
0N/A    }
0N/A
0N/A
0N/A    /**
0N/A     * Creates an input source from a given file, autodetecting
0N/A     * the character encoding.
0N/A     */
0N/A    static public InputSource createInputSource(File file)
0N/A            throws IOException {
0N/A        InputSource retval;
0N/A        String path;
0N/A
0N/A        retval = new InputSource(XmlReader.createReader(new FileInputStream(file)));
0N/A
0N/A        // On JDK 1.2 and later, simplify this:
0N/A        //    "path = file.toURL ().toString ()".
0N/A        path = file.getAbsolutePath();
0N/A        if (File.separatorChar != '/')
0N/A            path = path.replace(File.separatorChar, '/');
0N/A        if (!path.startsWith("/"))
29N/A            path = "/" + path;
29N/A        if (!path.endsWith("/") && file.isDirectory())
29N/A            path = path + "/";
0N/A
0N/A        retval.setSystemId("file:" + path);
0N/A        return retval;
0N/A    }
0N/A
0N/A
29N/A    /**
0N/A     * <b>SAX:</b>
0N/A     * Resolve the given entity into an input source.  If the name can't
0N/A     * be mapped to a preferred form of the entity, the URI is used.  To
0N/A     * resolve the entity, first a local catalog mapping names to URIs is
0N/A     * consulted.  If no mapping is found there, a catalog mapping names
0N/A     * to java resources is consulted.  Finally, if neither mapping found
29N/A     * a copy of the entity, the specified URI is used.
0N/A     * <p/>
0N/A     * <P> When a URI is used, <a href="#createInputSource">
0N/A     * createInputSource</a> is used to correctly deduce the character
0N/A     * encoding used by this entity.  No MIME type checking is done.
0N/A     *
0N/A     * @param name Used to find alternate copies of the entity, when
0N/A     *             this value is non-null; this is the XML "public ID".
0N/A     * @param uri  Used when no alternate copy of the entity is found;
0N/A     *             this is the XML "system ID", normally a URI.
0N/A     */
0N/A    public InputSource resolveEntity(String name, String uri)
0N/A            throws IOException {
0N/A        InputSource retval;
29N/A        String mappedURI = name2uri(name);
0N/A        InputStream stream;
0N/A
0N/A        // prefer explicit URI mappings, then bundled resources...
0N/A        if (mappedURI == null && (stream = mapResource(name)) != null) {
0N/A            uri = "java:resource:" + (String) id2resource.get(name);
0N/A            retval = new InputSource(XmlReader.createReader(stream));
29N/A
0N/A            // ...and treat all URIs the same (as URLs for now).
0N/A        } else {
0N/A            URL url;
0N/A            URLConnection conn;
0N/A
0N/A            if (mappedURI != null)
29N/A                uri = mappedURI;
0N/A            else if (uri == null)
0N/A                return null;
0N/A
0N/A            url = new URL(uri);
0N/A            conn = url.openConnection();
0N/A            uri = conn.getURL().toString();
0N/A            // System.out.println ("++ URI: " + url);
0N/A            if (ignoringMIME)
0N/A                retval = new InputSource(XmlReader.createReader(conn.getInputStream()));
0N/A            else {
0N/A                String contentType = conn.getContentType();
0N/A                retval = createInputSource(contentType,
29N/A                        conn.getInputStream(),
0N/A                        false, url.getProtocol());
0N/A            }
0N/A        }
0N/A        retval.setSystemId(uri);
0N/A        retval.setPublicId(name);
0N/A        return retval;
0N/A    }
0N/A
0N/A
0N/A    /**
0N/A     * Returns true if this resolver is ignoring MIME types in the documents
29N/A     * it returns, to work around bugs in how servers have reported the
0N/A     * documents' MIME types.
0N/A     */
0N/A    public boolean isIgnoringMIME() {
0N/A        return ignoringMIME;
29N/A    }
0N/A
0N/A    /**
0N/A     * Tells the resolver whether to ignore MIME types in the documents it
0N/A     * retrieves.  Many web servers incorrectly assign text documents a
0N/A     * default character encoding, even when that is incorrect.  For example,
0N/A     * all HTTP text documents default to use ISO-8859-1 (used for Western
0N/A     * European languages), and other MIME sources default text documents
0N/A     * to use US-ASCII (a seven bit encoding).  For XML documents which
0N/A     * include text encoding declarations (as most should do), these server
0N/A     * bugs can be worked around by ignoring the MIME type entirely.
0N/A     */
0N/A    public void setIgnoringMIME(boolean value) {
0N/A        ignoringMIME = value;
0N/A    }
0N/A
0N/A
0N/A    // maps the public ID to an alternate URI, if one is registered
0N/A    private String name2uri(String publicId) {
0N/A        if (publicId == null || id2uri == null)
0N/A            return null;
0N/A        return (String) id2uri.get(publicId);
0N/A    }
0N/A
0N/A
0N/A    /**
0N/A     * Registers the given public ID as corresponding to a particular
0N/A     * URI, typically a local copy.  This URI will be used in preference
0N/A     * to ones provided as system IDs in XML entity declarations.  This
0N/A     * mechanism would most typically be used for Document Type Definitions
0N/A     * (DTDs), where the public IDs are formally managed and versioned.
0N/A     *
0N/A     * @param publicId The managed public ID being mapped
0N/A     * @param uri      The URI of the preferred copy of that entity
0N/A     */
671N/A    public void registerCatalogEntry(String publicId,
0N/A                                     String uri) {
0N/A        if (id2uri == null)
0N/A            id2uri = new Hashtable(17);
0N/A        id2uri.put(publicId, uri);
0N/A    }
0N/A
0N/A
0N/A    // return the resource as a stream
0N/A    private InputStream mapResource(String publicId) {
0N/A        // System.out.println ("++ PUBLIC: " + publicId);
0N/A        if (publicId == null || id2resource == null)
0N/A            return null;
0N/A
0N/A        String resourceName = (String) id2resource.get(publicId);
29N/A        ClassLoader loader = null;
29N/A
0N/A        if (resourceName == null)
0N/A            return null;
29N/A        // System.out.println ("++ Resource: " + resourceName);
0N/A
0N/A        if (id2loader != null)
0N/A            loader = (ClassLoader) id2loader.get(publicId);
0N/A        // System.out.println ("++ Loader: " + loader);
0N/A        if (loader == null)
0N/A            return ClassLoader.getSystemResourceAsStream(resourceName);
29N/A        return loader.getResourceAsStream(resourceName);
0N/A    }
29N/A
29N/A    /**
0N/A     * Registers a given public ID as corresponding to a particular Java
0N/A     * resource in a given class loader, typically distributed with a
0N/A     * software package.  This resource will be preferred over system IDs
0N/A     * included in XML documents.  This mechanism should most typically be
29N/A     * used for Document Type Definitions (DTDs), where the public IDs are
29N/A     * formally managed and versioned.
0N/A     * <p/>
0N/A     * <P> If a mapping to a URI has been provided, that mapping takes
29N/A     * precedence over this one.
0N/A     *
0N/A     * @param publicId     The managed public ID being mapped
0N/A     * @param resourceName The name of the Java resource
29N/A     * @param loader       The class loader holding the resource, or null if
29N/A     *                     it is a system resource.
29N/A     */
29N/A    public void registerCatalogEntry(String publicId,
29N/A                                     String resourceName,
29N/A                                     ClassLoader loader) {
29N/A        if (id2resource == null)
29N/A            id2resource = new Hashtable(17);
29N/A        id2resource.put(publicId, resourceName);
29N/A
29N/A        if (loader != null) {
29N/A            if (id2loader == null)
29N/A                id2loader = new Hashtable(17);
29N/A            id2loader.put(publicId, loader);
29N/A        }
29N/A    }
0N/A}
0N/A