325N/A * Copyright (c) 1998, 2011, Oracle and/or its affiliates. All rights reserved. 325N/A * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 325N/A * This code is free software; you can redistribute it and/or modify it 325N/A * under the terms of the GNU General Public License version 2 only, as 325N/A * published by the Free Software Foundation. Oracle designates this 325N/A * particular file as subject to the "Classpath" exception as provided 325N/A * by Oracle in the LICENSE file that accompanied this code. 325N/A * This code is distributed in the hope that it will be useful, but WITHOUT 325N/A * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 325N/A * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 325N/A * version 2 for more details (a copy is included in the LICENSE file that 325N/A * accompanied this code). 325N/A * You should have received a copy of the GNU General Public License version 325N/A * 2 along with this work; if not, write to the Free Software Foundation, 325N/A * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 325N/A * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 325N/A * or visit www.oracle.com if you need additional information or have any 325N/A * Methods in this class are used to determine whether characters may 325N/A * appear in certain roles in XML documents. Such methods are used 325N/A * both to parse and to create such documents. 325N/A * @author David Brownell 325N/A * @version 1.1, 00/08/05 325N/A // can't construct instances 325N/A * Returns true if the argument, a UCS-4 character code, is valid in 325N/A * XML documents. Unicode characters fit into the low sixteen 325N/A * bits of a UCS-4 character, and pairs of Unicode <em>surrogate 325N/A * characters</em> can be combined to encode UCS-4 characters in 325N/A * documents containing only Unicode. 
(The <code>char</code> datatype 325N/A * in the Java Programming Language represents Unicode characters, 325N/A * including unpaired surrogates.) 325N/A * <P> In XML, UCS-4 characters can also be encoded by the use of 325N/A * <em>character references</em> such as <b>&#x12345678;</b>, which 325N/A * happens to refer to a character that is disallowed in XML documents. 325N/A * UCS-4 characters allowed in XML documents can be expressed with 325N/A * one or two Unicode characters. 325N/A * @param ucs4char The 32-bit UCS-4 character being tested. 325N/A // [2] Char ::= #x0009 | #x000A | #x000D 325N/A // ... surrogates excluded! 325N/A // | [#x10000-#x10ffff] 325N/A * Returns true if the character is allowed to be a non-initial 325N/A * character in names according to the XML recommendation. 325N/A * @see #isNCNameChar(char) 325N/A // [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' 325N/A // | CombiningChar | Extender 325N/A else if (c ==
'.' || c ==
'-' || c ==
'_' || c ==
':' 325N/A * Returns true if the character is allowed to be a non-initial 325N/A * character in unscoped names according to the rules of the XML 325N/A * Namespaces proposed recommendation. Except for precluding 325N/A * the colon (used to separate names from their scopes) these 325N/A * characters are just as allowed by the XML recommendation. 325N/A * @see #isNameChar(char) 325N/A // [NC 5] NCNameChar ::= Letter | Digit | '.' | '-' | '_' 325N/A // | CombiningChar | Extender 325N/A * Returns true if the character is allowed where XML supports 325N/A * whitespace characters, false otherwise. 325N/A return c ==
' ' || c ==
'\t' || c ==
'\n' || c ==
'\r';
325N/A * NOTE: java.lang.Character.getType() values are: 325N/A * UPPERCASE_LETTER = 1, // Lu 325N/A * LOWERCASE_LETTER = 2, // Ll 325N/A * TITLECASE_LETTER = 3, // Lt 325N/A * MODIFIER_LETTER = 4, // Lm 325N/A * OTHER_LETTER = 5, // Lo 325N/A * NON_SPACING_MARK = 6, // Mn 325N/A * ENCLOSING_MARK = 7, // Me 325N/A * COMBINING_SPACING_MARK = 8, // Mc 325N/A * DECIMAL_DIGIT_NUMBER = 9, // Nd 325N/A * LETTER_NUMBER = 10, // Nl 325N/A * OTHER_NUMBER = 11, // No 325N/A * SPACE_SEPARATOR = 12, // Zs 325N/A * LINE_SEPARATOR = 13, // Zl 325N/A * PARAGRAPH_SEPARATOR = 14, // Zp 325N/A * // 17 reserved for proposed Ci category 325N/A * PRIVATE_USE = 18, // Co 325N/A * SURROGATE = 19, // Cs 325N/A * DASH_PUNCTUATION = 20, // Pd 325N/A * START_PUNCTUATION = 21, // Ps 325N/A * END_PUNCTUATION = 22, // Pe 325N/A * CONNECTOR_PUNCTUATION = 23, // Pc 325N/A * OTHER_PUNCTUATION = 24, // Po 325N/A * MATH_SYMBOL = 25, // Sm 325N/A * CURRENCY_SYMBOL = 26, // Sc 325N/A * MODIFIER_SYMBOL = 27, // Sk 325N/A * OTHER_SYMBOL = 28; // So 325N/A * Returns true if the character is an XML "letter". XML Names must 325N/A * start with Letters or a few other characters, but other characters 325N/A * in names must only satisfy the <em>isNameChar</em> predicate. 325N/A * @see #isNameChar(char) 325N/A * @see #isNCNameChar(char) 325N/A // [84] Letter ::= BaseChar | Ideographic 325N/A // [85] BaseChar ::= ... too much to repeat 325N/A // [86] Ideographic ::= ... too much to repeat 325N/A // Optimize the typical case. 325N/A if (c >=
'a' && c <=
'z')
325N/A if (c >=
'A' && c <=
'Z')
325N/A // Since the tables are too ridiculous to use in code, 325N/A // we're using the footnotes here to drive this test. 325N/A // app. B footnote says these are 'name start' 325N/A // OK, here we just have some exceptions to check... 325N/A // per "5.14 of Unicode", rule out some combiners 325N/A && !(c >=
0x20dd && c <=
0x20e0);
325N/A // check for some exceptions: these are "alphabetic" 325N/A return ((c >=
0x02bb && c <=
0x02c1)
325N/A || c ==
0x0559 || c ==
0x06e5 || c ==
0x06e6);
325N/A // XML 1.0 discourages "compatibility" characters in names; these 325N/A // were defined to permit passing through some information stored in 325N/A // older non-Unicode character sets. These always have alternative 325N/A // representations in Unicode, e.g. using combining chars. 325N/A // the numerous comparisons here seem unavoidable, 325N/A // but the switch can reduce the number which must 325N/A // actually be executed. 325N/A switch ((c >>
8) &
0x0ff) {
325N/A // ISO Latin/1 has a few compatibility characters 325N/A return c ==
0x00aa || c ==
0x00b5 || c ==
0x00ba;
325N/A // as do Latin Extended A and (parts of) B 325N/A return (c >=
0x0132 && c <=
0x0133)
325N/A || (c >=
0x013f && c <=
0x0140)
325N/A || (c >=
0x01c4 && c <=
0x01cc)
325N/A || (c >=
0x01f1 && c <=
0x01f3);
325N/A // some spacing modifiers 325N/A return (c >=
0x02b0 && c <=
0x02b8)
325N/A || (c >=
0x02e0 && c <=
0x02e4);
325N/A return c ==
0x037a;
// Greek 325N/A return c ==
0x0587;
// Armenian 325N/A return c >=
0x0edc && c <=
0x0edd;
// Laotian 325N/A // big chunks of Hangul Jamo are all "compatibility" 325N/A || (c >=
0x1113 && c <=
0x113b)
325N/A || (c >=
0x1141 && c <=
0x114b)
325N/A || (c >=
0x1151 && c <=
0x1153)
325N/A || (c >=
0x1156 && c <=
0x1158)
325N/A || (c >=
0x116a && c <=
0x116c)
325N/A || (c >=
0x116f && c <=
0x1171)
325N/A || (c >=
0x1176 && c <=
0x119d)
325N/A || (c >=
0x119f && c <=
0x11a2)
325N/A || (c >=
0x11a9 && c <=
0x11aa)
325N/A || (c >=
0x11ac && c <=
0x11ad)
325N/A || (c >=
0x11b0 && c <=
0x11b6)
325N/A || (c >=
0x11c3 && c <=
0x11ea)
325N/A || (c >=
0x11ec && c <=
0x11ef)
325N/A || (c >=
0x11f1 && c <=
0x11f8)
325N/A return c ==
0x207f;
// superscript 325N/A // various letterlike symbols 325N/A || (c >=
0x210a && c <=
0x2113)
325N/A || (c >=
0x2118 && c <=
0x211d)
325N/A || (c >=
0x212c && c <=
0x212d)
325N/A || (c >=
0x212f && c <=
0x2138)
325N/A // most Roman numerals (less 1K, 5K, 10K) 325N/A || (c >=
0x2160 && c <=
0x217f)
325N/A return c >=
0x309b && c <=
0x309c;
325N/A // all Hangul Compatibility Jamo 325N/A return c >=
0x3131 && c <=
0x318e;
325N/A // the whole "compatibility" area is for that purpose! 325N/A // most of Unicode isn't flagged as being for compatibility 325N/A // [84] Letter ::= BaseChar | Ideographic 325N/A // [85] BaseChar ::= ... too much to repeat 325N/A // [86] Ideographic ::= ... too much to repeat 325N/A // [87] CombiningChar ::= ... too much to repeat 325N/A // Optimize the typical case. 325N/A if (c >=
'a' && c <=
'z')
325N/A if (c >=
'A' && c <=
'Z')
325N/A // Since the tables are too ridiculous to use in code, 325N/A // we're using the footnotes here to drive this test. 325N/A // app. B footnote says these are 'name start' 325N/A // ... and these are name characters 'other 325N/A // than name start characters' 325N/A // OK, here we just have some exceptions to check... 325N/A // per "5.14 of Unicode", rule out some combiners 325N/A && !(c >=
0x20dd && c <=
0x20e0);
325N/A // added a character ... 325N/A // java.lang.Character.isDigit is correct from the XML point 325N/A // of view except that it allows "fullwidth" digits. 325N/A && !((c >=
0xff10) && (c <=
0xff19));
325N/A // [89] Extender ::= ... 325N/A return c ==
0x00b7 || c ==
0x02d0 || c ==
0x02d1 || c ==
0x0387 325N/A || c ==
0x0640 || c ==
0x0e46 || c ==
0x0ec6 325N/A || c ==
0x3005 || (c >=
0x3031 && c <=
0x3035)
325N/A || (c >=
0x309d && c <=
0x309e)
325N/A || (c >=
0x30fc && c <=
0x30fe)