325N/A/*
325N/A * Copyright (c) 1998, 2011, Oracle and/or its affiliates. All rights reserved.
325N/A * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
325N/A *
325N/A * This code is free software; you can redistribute it and/or modify it
325N/A * under the terms of the GNU General Public License version 2 only, as
325N/A * published by the Free Software Foundation. Oracle designates this
325N/A * particular file as subject to the "Classpath" exception as provided
325N/A * by Oracle in the LICENSE file that accompanied this code.
325N/A *
325N/A * This code is distributed in the hope that it will be useful, but WITHOUT
325N/A * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
325N/A * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
325N/A * version 2 for more details (a copy is included in the LICENSE file that
325N/A * accompanied this code).
325N/A *
325N/A * You should have received a copy of the GNU General Public License version
325N/A * 2 along with this work; if not, write to the Free Software Foundation,
325N/A * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
325N/A *
325N/A * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
325N/A * or visit www.oracle.com if you need additional information or have any
325N/A * questions.
325N/A */
325N/A
325N/Apackage com.sun.xml.internal.dtdparser;
325N/A
325N/A
/**
 * Static predicates that decide whether a character may appear in a
 * particular role in an XML document.  The same tests serve both for
 * parsing documents and for producing them.
 *
 * <p>The name-related tests do not embed the character tables of the XML
 * recommendation.  They classify characters with
 * {@link Character#getType(char)} and then apply a list of hard-coded
 * exceptions, so their results depend on the Unicode data of the running
 * Java platform.
 *
 * @author David Brownell
 * @version 1.1, 00/08/05
 */
public class XmlChars {
    // Static utility class: the private constructor prevents instantiation.
    private XmlChars() {
    }

    /**
     * Returns true if the argument, a UCS-4 character code, is valid in
     * XML documents.  Unicode characters fit into the low sixteen
     * bits of a UCS-4 character, and pairs of Unicode <em>surrogate
     * characters</em> can be combined to encode UCS-4 characters in
     * documents containing only Unicode.  (The {@code char} datatype
     * in the Java Programming Language represents Unicode characters,
     * including unpaired surrogates.)
     *
     * <p>In XML, UCS-4 characters can also be encoded by the use of
     * <em>character references</em> such as <b>&amp;#x12345678;</b>, which
     * happens to refer to a character that is disallowed in XML documents.
     * UCS-4 characters allowed in XML documents can be expressed with
     * one or two Unicode characters.
     *
     * @param ucs4char The 32-bit UCS-4 character being tested.
     * @return true if the code matches the {@code Char} production;
     *         false for any other value, including negative values,
     *         surrogate code units, 0xFFFE and 0xFFFF.
     */
    public static boolean isChar(int ucs4char) {
        // [2] Char ::= #x0009 | #x000A | #x000D
        //            | [#x0020-#xD7FF]
        //            ... surrogates excluded!
        //            | [#xE000-#xFFFD]
        //            | [#x10000-#x10ffff]
        return (ucs4char >= 0x0020 && ucs4char <= 0xD7FF)
                || ucs4char == 0x000A
                || ucs4char == 0x0009
                || ucs4char == 0x000D
                || (ucs4char >= 0xE000 && ucs4char <= 0xFFFD)
                || (ucs4char >= 0x10000 && ucs4char <= 0x10ffff);
    }

    /**
     * Returns true if the character is allowed to be a non-initial
     * character in names according to the XML recommendation.
     *
     * @param c the character being tested
     * @return true if the character is accepted as a name character
     * @see #isNCNameChar(char)
     * @see #isLetter(char)
     */
    public static boolean isNameChar(char c) {
        // [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':'
        //                | CombiningChar | Extender

        // isLetter2 covers Letter, Digit and CombiningChar in one test.
        if (isLetter2(c)) {
            return true;
        }
        return c == '.' || c == '-' || c == '_' || c == ':'
                || isExtender(c);
    }

    /**
     * Returns true if the character is allowed to be a non-initial
     * character in unscoped names according to the rules of the XML
     * Namespaces proposed recommendation.  Except for precluding
     * the colon (used to separate names from their scopes) these
     * characters are just as allowed by the XML recommendation.
     *
     * @param c the character being tested
     * @return true if the character is not a colon and
     *         {@link #isNameChar(char)} accepts it
     * @see #isNameChar(char)
     * @see #isLetter(char)
     */
    public static boolean isNCNameChar(char c) {
        // [NC 5] NCNameChar ::= Letter | Digit | '.' | '-' | '_'
        //                     | CombiningChar | Extender
        return c != ':' && isNameChar(c);
    }

    /**
     * Returns true if the character is allowed where XML supports
     * whitespace characters, false otherwise.
     *
     * @param c the character being tested
     * @return true for space, tab, line feed and carriage return
     */
    public static boolean isSpace(char c) {
        return c == ' ' || c == '\t' || c == '\n' || c == '\r';
    }


    /*
     * NOTE:  java.lang.Character.getType() values are:
     *
     * UNASSIGNED                    = 0,
     *
     * UPPERCASE_LETTER            = 1,    // Lu
     * LOWERCASE_LETTER            = 2,    // Ll
     * TITLECASE_LETTER            = 3,    // Lt
     * MODIFIER_LETTER             = 4,    // Lm
     * OTHER_LETTER                = 5,    // Lo
     * NON_SPACING_MARK            = 6,    // Mn
     * ENCLOSING_MARK              = 7,    // Me
     * COMBINING_SPACING_MARK      = 8,    // Mc
     * DECIMAL_DIGIT_NUMBER        = 9,    // Nd
     * LETTER_NUMBER               = 10,   // Nl
     * OTHER_NUMBER                = 11,   // No
     * SPACE_SEPARATOR             = 12,   // Zs
     * LINE_SEPARATOR              = 13,   // Zl
     * PARAGRAPH_SEPARATOR         = 14,   // Zp
     * CONTROL                     = 15,   // Cc
     * FORMAT                      = 16,   // Cf
     *                         // 17 reserved for proposed Ci category
     * PRIVATE_USE                 = 18,   // Co
     * SURROGATE                   = 19,   // Cs
     * DASH_PUNCTUATION            = 20,   // Pd
     * START_PUNCTUATION           = 21,   // Ps
     * END_PUNCTUATION             = 22,   // Pe
     * CONNECTOR_PUNCTUATION       = 23,   // Pc
     * OTHER_PUNCTUATION           = 24,   // Po
     * MATH_SYMBOL                 = 25,   // Sm
     * CURRENCY_SYMBOL             = 26,   // Sc
     * MODIFIER_SYMBOL             = 27,   // Sk
     * OTHER_SYMBOL                = 28;   // So
     */

    /**
     * Returns true if the character is an XML "letter".  XML Names must
     * start with Letters or a few other characters, but other characters
     * in names must only satisfy the <em>isNameChar</em> predicate.
     *
     * @param c the character being tested
     * @return true if the character is accepted as a letter
     * @see #isNameChar(char)
     * @see #isNCNameChar(char)
     */
    public static boolean isLetter(char c) {
        // [84] Letter ::= BaseChar | Ideographic
        // [85] BaseChar ::= ... too much to repeat
        // [86] Ideographic ::= ... too much to repeat

        //
        // Optimize the typical case.
        //
        if (c >= 'a' && c <= 'z') {
            return true;
        }
        // Quick reject; the general test below also yields false for '/'.
        if (c == '/') {
            return false;
        }
        if (c >= 'A' && c <= 'Z') {
            return true;
        }

        //
        // Since the tables are too ridiculous to use in code,
        // we're using the footnotes here to drive this test.
        //
        switch (Character.getType(c)) {
            // app. B footnote says these are 'name start'
            // chars' ...
            case Character.LOWERCASE_LETTER:        // Ll
            case Character.UPPERCASE_LETTER:        // Lu
            case Character.OTHER_LETTER:            // Lo
            case Character.TITLECASE_LETTER:        // Lt
            case Character.LETTER_NUMBER:           // Nl
                // OK, here we just have some exceptions to check...
                return !isExcludedFromNames(c);

            default:
                // check for some exceptions:  these are "alphabetic"
                return (c >= 0x02bb && c <= 0x02c1)
                        || c == 0x0559 || c == 0x06e5 || c == 0x06e6;
        }
    }

    //
    // Exceptions shared by isLetter and isLetter2:  a character whose
    // general category would otherwise qualify is still rejected when
    // it is a "compatibility" character or, per "5.14 of Unicode", one
    // of the combiners in the range 0x20dd-0x20e0.
    //
    private static boolean isExcludedFromNames(char c) {
        return isCompatibilityChar(c)
                || (c >= 0x20dd && c <= 0x20e0);
    }

    //
    // XML 1.0 discourages "compatibility" characters in names; these
    // were defined to permit passing through some information stored in
    // older non-Unicode character sets.  These always have alternative
    // representations in Unicode, e.g. using combining chars.
    //
    private static boolean isCompatibilityChar(char c) {
        // the numerous comparisons here seem unavoidable,
        // but the switch can reduce the number which must
        // actually be executed.

        // Dispatch on the high byte of the character.
        switch ((c >> 8) & 0x0ff) {
            case 0x00:
                // ISO Latin/1 has a few compatibility characters
                return c == 0x00aa || c == 0x00b5 || c == 0x00ba;

            case 0x01:
                // as do Latin Extended A and (parts of) B
                return (c >= 0x0132 && c <= 0x0133)
                        || (c >= 0x013f && c <= 0x0140)
                        || c == 0x0149
                        || c == 0x017f
                        || (c >= 0x01c4 && c <= 0x01cc)
                        || (c >= 0x01f1 && c <= 0x01f3);

            case 0x02:
                // some spacing modifiers
                return (c >= 0x02b0 && c <= 0x02b8)
                        || (c >= 0x02e0 && c <= 0x02e4);

            case 0x03:
                return c == 0x037a;                 // Greek

            case 0x05:
                return c == 0x0587;                 // Armenian

            case 0x0e:
                return c >= 0x0edc && c <= 0x0edd;  // Laotian

            case 0x11:
                // big chunks of Hangul Jamo are all "compatibility"
                return c == 0x1101
                        || c == 0x1104
                        || c == 0x1108
                        || c == 0x110a
                        || c == 0x110d
                        || (c >= 0x1113 && c <= 0x113b)
                        || c == 0x113d
                        || c == 0x113f
                        || (c >= 0x1141 && c <= 0x114b)
                        || c == 0x114d
                        || c == 0x114f
                        || (c >= 0x1151 && c <= 0x1153)
                        || (c >= 0x1156 && c <= 0x1158)
                        || c == 0x1162
                        || c == 0x1164
                        || c == 0x1166
                        || c == 0x1168
                        || (c >= 0x116a && c <= 0x116c)
                        || (c >= 0x116f && c <= 0x1171)
                        || c == 0x1174
                        || (c >= 0x1176 && c <= 0x119d)
                        || (c >= 0x119f && c <= 0x11a2)
                        || (c >= 0x11a9 && c <= 0x11aa)
                        || (c >= 0x11ac && c <= 0x11ad)
                        || (c >= 0x11b0 && c <= 0x11b6)
                        || c == 0x11b9
                        || c == 0x11bb
                        || (c >= 0x11c3 && c <= 0x11ea)
                        || (c >= 0x11ec && c <= 0x11ef)
                        || (c >= 0x11f1 && c <= 0x11f8);

            case 0x20:
                return c == 0x207f;                 // superscript

            case 0x21:
                return
                        // various letterlike symbols
                        c == 0x2102
                        || c == 0x2107
                        || (c >= 0x210a && c <= 0x2113)
                        || c == 0x2115
                        || (c >= 0x2118 && c <= 0x211d)
                        || c == 0x2124
                        || c == 0x2128
                        || (c >= 0x212c && c <= 0x212d)
                        || (c >= 0x212f && c <= 0x2138)

                        // most Roman numerals (less 1K, 5K, 10K)
                        || (c >= 0x2160 && c <= 0x217f);

            case 0x30:
                // some Hiragana
                return c >= 0x309b && c <= 0x309c;

            case 0x31:
                // all Hangul Compatibility Jamo
                return c >= 0x3131 && c <= 0x318e;

            case 0xf9:
            case 0xfa:
            case 0xfb:
            case 0xfc:
            case 0xfd:
            case 0xfe:
            case 0xff:
                // the whole "compatibility" area is for that purpose!
                return true;

            default:
                // most of Unicode isn't flagged as being for compatibility
                return false;
        }
    }

    // guts of isNameChar/isNCNameChar
    private static boolean isLetter2(char c) {
        // [84] Letter ::= BaseChar | Ideographic
        // [85] BaseChar ::= ... too much to repeat
        // [86] Ideographic ::= ... too much to repeat
        // [87] CombiningChar ::= ... too much to repeat

        //
        // Optimize the typical case.
        //
        if (c >= 'a' && c <= 'z') {
            return true;
        }
        // Quick reject; the general test below also yields false for '>'.
        if (c == '>') {
            return false;
        }
        if (c >= 'A' && c <= 'Z') {
            return true;
        }

        //
        // Since the tables are too ridiculous to use in code,
        // we're using the footnotes here to drive this test.
        //
        switch (Character.getType(c)) {
            // app. B footnote says these are 'name start'
            // chars' ...
            case Character.LOWERCASE_LETTER:        // Ll
            case Character.UPPERCASE_LETTER:        // Lu
            case Character.OTHER_LETTER:            // Lo
            case Character.TITLECASE_LETTER:        // Lt
            case Character.LETTER_NUMBER:           // Nl
            // ... and these are name characters 'other
            // than name start characters'
            case Character.COMBINING_SPACING_MARK:  // Mc
            case Character.ENCLOSING_MARK:          // Me
            case Character.NON_SPACING_MARK:        // Mn
            case Character.MODIFIER_LETTER:         // Lm
            case Character.DECIMAL_DIGIT_NUMBER:    // Nd
                // OK, here we just have some exceptions to check...
                return !isExcludedFromNames(c);

            default:
                // added a character ...
                return c == 0x0387;
        }
    }

    // Not called from anywhere else in this class.
    private static boolean isDigit(char c) {
        // [88] Digit ::= ...

        //
        // java.lang.Character.isDigit is correct from the XML point
        // of view except that it allows "fullwidth" digits.
        //
        return Character.isDigit(c)
                && !((c >= 0xff10) && (c <= 0xff19));
    }

    private static boolean isExtender(char c) {
        // [89] Extender ::= ...
        return c == 0x00b7 || c == 0x02d0 || c == 0x02d1 || c == 0x0387
                || c == 0x0640 || c == 0x0e46 || c == 0x0ec6
                || c == 0x3005 || (c >= 0x3031 && c <= 0x3035)
                || (c >= 0x309d && c <= 0x309e)
                || (c >= 0x30fc && c <= 0x30fe);
    }
}