text/normalizer/UCharacter.java

0N/A/*
2362N/A * Copyright (c) 2009, Oracle and/or its affiliates. All rights reserved.
0N/A * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
0N/A *
0N/A * This code is free software; you can redistribute it and/or modify it
0N/A * under the terms of the GNU General Public License version 2 only, as
2362N/A * published by the Free Software Foundation.  Oracle designates this
0N/A * particular file as subject to the "Classpath" exception as provided
2362N/A * by Oracle in the LICENSE file that accompanied this code.
0N/A *
0N/A * This code is distributed in the hope that it will be useful, but WITHOUT
0N/A * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
0N/A * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
0N/A * version 2 for more details (a copy is included in the LICENSE file that
0N/A * accompanied this code).
0N/A *
0N/A * You should have received a copy of the GNU General Public License version
0N/A * 2 along with this work; if not, write to the Free Software Foundation,
0N/A * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
0N/A *
2362N/A * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
2362N/A * or visit www.oracle.com if you need additional information or have any
2362N/A * questions.
0N/A */
0N/A/*
0N/A *******************************************************************************
1091N/A * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved         *
0N/A *                                                                             *
0N/A * The original version of this source code and documentation is copyrighted   *
0N/A * and owned by IBM, These materials are provided under terms of a License     *
0N/A * Agreement between IBM and Sun. This technology is protected by multiple     *
0N/A * US and International patents. This notice and attribution to IBM may not    *
0N/A * to removed.                                                                 *
0N/A *******************************************************************************
0N/A */
0N/A
0N/Apackage sun.text.normalizer;
0N/A
1091N/Aimport java.io.IOException;
1091N/Aimport java.util.MissingResourceException;
0N/A
0N/A/**
0N/A * <p>
0N/A * The UCharacter class provides extensions to the
1091N/A * <a href="http://java.sun.com/j2se/1.5/docs/api/java/lang/Character.html">
0N/A * java.lang.Character</a> class. These extensions provide support for
1091N/A * more Unicode properties and together with the <a href=../text/UTF16.html>UTF16</a>
0N/A * class, provide support for supplementary characters (those with code
0N/A * points above U+FFFF).
1091N/A * Each ICU release supports the latest version of Unicode available at that time.
0N/A * </p>
0N/A * <p>
0N/A * Code points are represented in these API using ints. While it would be
0N/A * more convenient in Java to have a separate primitive datatype for them,
0N/A * ints suffice in the meantime.
0N/A * </p>
0N/A * <p>
0N/A * To use this class please add the jar file name icu4j.jar to the
0N/A * class path, since it contains data files which supply the information used
0N/A * by this file.<br>
0N/A * E.g. In Windows <br>
0N/A * <code>set CLASSPATH=%CLASSPATH%;$JAR_FILE_PATH/ucharacter.jar</code>.<br>
0N/A * Otherwise, another method would be to copy the files uprops.dat and
0N/A * unames.icu from the icu4j source subdirectory
0N/A * <i>$ICU4J_SRC/src/com.ibm.icu.impl.data</i> to your class directory
0N/A * <i>$ICU4J_CLASS/com.ibm.icu.impl.data</i>.
0N/A * </p>
0N/A * <p>
1091N/A * Aside from the additions for UTF-16 support, and the updated Unicode
0N/A * properties, the main differences between UCharacter and Character are:
0N/A * <ul>
 * <li> UCharacter is not designed to be a char wrapper and does not have
 *      APIs that involve management of a single wrapped char.<br>
0N/A *      These include:
0N/A *      <ul>
0N/A *        <li> char charValue(),
0N/A *        <li> int compareTo(java.lang.Character, java.lang.Character), etc.
0N/A *      </ul>
1091N/A * <li> UCharacter does not include Character APIs that are deprecated, nor
0N/A *      does it include the Java-specific character information, such as
0N/A *      boolean isJavaIdentifierPart(char ch).
0N/A * <li> Character maps characters 'A' - 'Z' and 'a' - 'z' to the numeric
0N/A *      values '10' - '35'. UCharacter also does this in digit and
0N/A *      getNumericValue, to adhere to the java semantics of these
0N/A *      methods.  New methods unicodeDigit, and
0N/A *      getUnicodeNumericValue do not treat the above code points
0N/A *      as having numeric values.  This is a semantic change from ICU4J 1.3.1.
0N/A * </ul>
0N/A * <p>
0N/A * Further detail differences can be determined from the program
1091N/A *        <a href="http://source.icu-project.org/repos/icu/icu4j/trunk/src/com/ibm/icu/dev/test/lang/UCharacterCompare.java">
0N/A *        com.ibm.icu.dev.test.lang.UCharacterCompare</a>
0N/A * </p>
0N/A * <p>
1091N/A * In addition to Java compatibility functions, which calculate derived properties,
1091N/A * this API provides low-level access to the Unicode Character Database.
1091N/A * </p>
1091N/A * <p>
 * Unicode assigns each code point (not just each assigned character) values
 * for many properties.
1091N/A * Most of them are simple boolean flags, or constants from a small enumerated list.
1091N/A * For some properties, values are strings or other relatively more complex types.
1091N/A * </p>
1091N/A * <p>
1091N/A * For more information see
1091N/A * "About the Unicode Character Database" (http://www.unicode.org/ucd/)
1091N/A * and the ICU User Guide chapter on Properties (http://www.icu-project.org/userguide/properties.html).
1091N/A * </p>
1091N/A * <p>
1091N/A * There are also functions that provide easy migration from C/POSIX functions
1091N/A * like isblank(). Their use is generally discouraged because the C/POSIX
1091N/A * standards do not define their semantics beyond the ASCII range, which means
1091N/A * that different implementations exhibit very different behavior.
1091N/A * Instead, Unicode properties should be used directly.
1091N/A * </p>
1091N/A * <p>
1091N/A * There are also only a few, broad C/POSIX character classes, and they tend
1091N/A * to be used for conflicting purposes. For example, the "isalpha()" class
1091N/A * is sometimes used to determine word boundaries, while a more sophisticated
1091N/A * approach would at least distinguish initial letters from continuation
1091N/A * characters (the latter including combining marks).
1091N/A * (In ICU, BreakIterator is the most sophisticated API for word boundaries.)
1091N/A * Another example: There is no "istitle()" class for titlecase characters.
1091N/A * </p>
1091N/A * <p>
1091N/A * ICU 3.4 and later provides API access for all twelve C/POSIX character classes.
1091N/A * ICU implements them according to the Standard Recommendations in
1091N/A * Annex C: Compatibility Properties of UTS #18 Unicode Regular Expressions
1091N/A * (http://www.unicode.org/reports/tr18/#Compatibility_Properties).
1091N/A * </p>
1091N/A * <p>
1091N/A * API access for C/POSIX character classes is as follows:
1091N/A * - alpha:     isUAlphabetic(c) or hasBinaryProperty(c, UProperty.ALPHABETIC)
1091N/A * - lower:     isULowercase(c) or hasBinaryProperty(c, UProperty.LOWERCASE)
1091N/A * - upper:     isUUppercase(c) or hasBinaryProperty(c, UProperty.UPPERCASE)
1091N/A * - punct:     ((1<<getType(c)) & ((1<<DASH_PUNCTUATION)|(1<<START_PUNCTUATION)|(1<<END_PUNCTUATION)|(1<<CONNECTOR_PUNCTUATION)|(1<<OTHER_PUNCTUATION)|(1<<INITIAL_PUNCTUATION)|(1<<FINAL_PUNCTUATION)))!=0
1091N/A * - digit:     isDigit(c) or getType(c)==DECIMAL_DIGIT_NUMBER
1091N/A * - xdigit:    hasBinaryProperty(c, UProperty.POSIX_XDIGIT)
1091N/A * - alnum:     hasBinaryProperty(c, UProperty.POSIX_ALNUM)
1091N/A * - space:     isUWhiteSpace(c) or hasBinaryProperty(c, UProperty.WHITE_SPACE)
1091N/A * - blank:     hasBinaryProperty(c, UProperty.POSIX_BLANK)
1091N/A * - cntrl:     getType(c)==CONTROL
1091N/A * - graph:     hasBinaryProperty(c, UProperty.POSIX_GRAPH)
1091N/A * - print:     hasBinaryProperty(c, UProperty.POSIX_PRINT)
1091N/A * </p>
1091N/A * <p>
1091N/A * The C/POSIX character classes are also available in UnicodeSet patterns,
1091N/A * using patterns like [:graph:] or \p{graph}.
1091N/A * </p>
1091N/A * <p>
1091N/A * Note: There are several ICU (and Java) whitespace functions.
1091N/A * Comparison:
1091N/A * - isUWhiteSpace=UCHAR_WHITE_SPACE: Unicode White_Space property;
1091N/A *       most of general categories "Z" (separators) + most whitespace ISO controls
1091N/A *       (including no-break spaces, but excluding IS1..IS4 and ZWSP)
1091N/A * - isWhitespace: Java isWhitespace; Z + whitespace ISO controls but excluding no-break spaces
1091N/A * - isSpaceChar: just Z (including no-break spaces)
1091N/A * </p>
1091N/A * <p>
0N/A * This class is not subclassable
0N/A * </p>
0N/A * @author Syn Wee Quek
0N/A * @stable ICU 2.1
0N/A * @see com.ibm.icu.lang.UCharacterEnums
0N/A */
0N/A
0N/Apublic final class UCharacter
0N/A{
0N/A
0N/A    /**
0N/A     * Numeric Type constants.
0N/A     * @see UProperty#NUMERIC_TYPE
0N/A     * @stable ICU 2.4
0N/A     */
0N/A    public static interface NumericType
0N/A    {
0N/A        /**
0N/A         * @stable ICU 2.4
0N/A         */
0N/A        public static final int DECIMAL = 1;
0N/A    }
0N/A
0N/A    // public data members -----------------------------------------------
0N/A
0N/A    /**
0N/A     * The lowest Unicode code point value.
0N/A     * @stable ICU 2.1
0N/A     */
0N/A    public static final int MIN_VALUE = UTF16.CODEPOINT_MIN_VALUE;
0N/A
0N/A    /**
0N/A     * The highest Unicode code point value (scalar value) according to the
0N/A     * Unicode Standard.
0N/A     * This is a 21-bit value (21 bits, rounded up).<br>
0N/A     * Up-to-date Unicode implementation of java.lang.Character.MIN_VALUE
0N/A     * @stable ICU 2.1
0N/A     */
0N/A    public static final int MAX_VALUE = UTF16.CODEPOINT_MAX_VALUE;
0N/A
0N/A    /**
0N/A     * The minimum value for Supplementary code points
0N/A     * @stable ICU 2.1
0N/A     */
0N/A    public static final int SUPPLEMENTARY_MIN_VALUE =
0N/A        UTF16.SUPPLEMENTARY_MIN_VALUE;
0N/A
0N/A    // public methods ----------------------------------------------------
0N/A
0N/A    /**
0N/A     * Retrieves the numeric value of a decimal digit code point.
0N/A     * <br>This method observes the semantics of
0N/A     * <code>java.lang.Character.digit()</code>.  Note that this
0N/A     * will return positive values for code points for which isDigit
0N/A     * returns false, just like java.lang.Character.
0N/A     * <br><em>Semantic Change:</em> In release 1.3.1 and
0N/A     * prior, this did not treat the European letters as having a
0N/A     * digit value, and also treated numeric letters and other numbers as
0N/A     * digits.
0N/A     * This has been changed to conform to the java semantics.
0N/A     * <br>A code point is a valid digit if and only if:
0N/A     * <ul>
0N/A     *   <li>ch is a decimal digit or one of the european letters, and
0N/A     *   <li>the value of ch is less than the specified radix.
0N/A     * </ul>
0N/A     * @param ch the code point to query
0N/A     * @param radix the radix
0N/A     * @return the numeric value represented by the code point in the
0N/A     * specified radix, or -1 if the code point is not a decimal digit
0N/A     * or if its value is too large for the radix
0N/A     * @stable ICU 2.1
0N/A     */
0N/A    public static int digit(int ch, int radix)
0N/A    {
0N/A        // when ch is out of bounds getProperty == 0
0N/A        int props = getProperty(ch);
1091N/A        int value;
1091N/A        if (getNumericType(props) == NumericType.DECIMAL) {
1091N/A            value = UCharacterProperty.getUnsignedValue(props);
1091N/A        } else {
1091N/A            value = getEuropeanDigit(ch);
0N/A        }
1091N/A        return (0 <= value && value < radix) ? value : -1;
0N/A    }
0N/A
0N/A    /**
0N/A     * Returns the Bidirection property of a code point.
0N/A     * For example, 0x0041 (letter A) has the LEFT_TO_RIGHT directional
0N/A     * property.<br>
0N/A     * Result returned belongs to the interface
0N/A     * <a href=UCharacterDirection.html>UCharacterDirection</a>
0N/A     * @param ch the code point to be determined its direction
0N/A     * @return direction constant from UCharacterDirection.
0N/A     * @stable ICU 2.1
0N/A     */
0N/A    public static int getDirection(int ch)
0N/A    {
1091N/A        return gBdp.getClass(ch);
0N/A    }
0N/A
0N/A    /**
1091N/A     * Returns a code point corresponding to the two UTF16 characters.
1091N/A     * @param lead the lead char
1091N/A     * @param trail the trail char
1091N/A     * @return code point if surrogate characters are valid.
1091N/A     * @exception IllegalArgumentException thrown when argument characters do
1091N/A     *            not form a valid codepoint
0N/A     * @stable ICU 2.1
0N/A     */
1091N/A    public static int getCodePoint(char lead, char trail)
0N/A    {
1091N/A        if (UTF16.isLeadSurrogate(lead) && UTF16.isTrailSurrogate(trail)) {
1091N/A            return UCharacterProperty.getRawSupplementary(lead, trail);
0N/A        }
1091N/A        throw new IllegalArgumentException("Illegal surrogate characters");
0N/A    }
0N/A
0N/A    /**
0N/A     * <p>Get the "age" of the code point.</p>
0N/A     * <p>The "age" is the Unicode version when the code point was first
0N/A     * designated (as a non-character or for Private Use) or assigned a
0N/A     * character.
0N/A     * <p>This can be useful to avoid emitting code points to receiving
0N/A     * processes that do not accept newer characters.</p>
0N/A     * <p>The data is from the UCD file DerivedAge.txt.</p>
0N/A     * @param ch The code point.
0N/A     * @return the Unicode version number
0N/A     * @stable ICU 2.6
0N/A     */
0N/A    public static VersionInfo getAge(int ch)
0N/A    {
0N/A        if (ch < MIN_VALUE || ch > MAX_VALUE) {
0N/A        throw new IllegalArgumentException("Codepoint out of bounds");
0N/A        }
0N/A        return PROPERTY_.getAge(ch);
0N/A    }
0N/A
0N/A    // private variables -------------------------------------------------
0N/A
0N/A    /**
0N/A     * Database storing the sets of character property
0N/A     */
0N/A    private static final UCharacterProperty PROPERTY_;
0N/A    /**
0N/A     * For optimization
0N/A     */
0N/A    private static final char[] PROPERTY_TRIE_INDEX_;
0N/A    private static final char[] PROPERTY_TRIE_DATA_;
0N/A    private static final int PROPERTY_INITIAL_VALUE_;
0N/A
1091N/A    private static final UBiDiProps gBdp;
1091N/A
0N/A    // block to initialise character property database
0N/A    static
0N/A    {
0N/A        try
0N/A        {
1091N/A            PROPERTY_ = UCharacterProperty.getInstance();
1091N/A            PROPERTY_TRIE_INDEX_ = PROPERTY_.m_trieIndex_;
1091N/A            PROPERTY_TRIE_DATA_ = PROPERTY_.m_trieData_;
1091N/A            PROPERTY_INITIAL_VALUE_ = PROPERTY_.m_trieInitialValue_;
0N/A        }
0N/A        catch (Exception e)
0N/A        {
1091N/A            throw new MissingResourceException(e.getMessage(),"","");
0N/A        }
0N/A
1091N/A        UBiDiProps bdp;
1091N/A        try {
1091N/A            bdp=UBiDiProps.getSingleton();
1091N/A        } catch(IOException e) {
1091N/A            bdp=UBiDiProps.getDummy();
1091N/A        }
1091N/A        gBdp=bdp;
1091N/A    }
0N/A
0N/A    /**
0N/A     * Shift to get numeric type
0N/A     */
1091N/A    private static final int NUMERIC_TYPE_SHIFT_ = 5;
0N/A    /**
0N/A     * Mask to get numeric type
0N/A     */
0N/A    private static final int NUMERIC_TYPE_MASK_ = 0x7 << NUMERIC_TYPE_SHIFT_;
0N/A
0N/A    // private methods ---------------------------------------------------
0N/A
0N/A    /**
0N/A     * Getting the digit values of characters like 'A' - 'Z', normal,
0N/A     * half-width and full-width. This method assumes that the other digit
0N/A     * characters are checked by the calling method.
0N/A     * @param ch character to test
0N/A     * @return -1 if ch is not a character of the form 'A' - 'Z', otherwise
0N/A     *         its corresponding digit will be returned.
0N/A     */
0N/A    private static int getEuropeanDigit(int ch) {
0N/A        if ((ch > 0x7a && ch < 0xff21)
0N/A            || ch < 0x41 || (ch > 0x5a && ch < 0x61)
1280N/A            || ch > 0xff5a || (ch > 0xff3a && ch < 0xff41)) {
0N/A            return -1;
0N/A        }
0N/A        if (ch <= 0x7a) {
0N/A            // ch >= 0x41 or ch < 0x61
0N/A            return ch + 10 - ((ch <= 0x5a) ? 0x41 : 0x61);
0N/A        }
0N/A        // ch >= 0xff21
0N/A        if (ch <= 0xff3a) {
0N/A            return ch + 10 - 0xff21;
0N/A        }
0N/A        // ch >= 0xff41 && ch <= 0xff5a
0N/A        return ch + 10 - 0xff41;
0N/A    }
0N/A
0N/A    /**
0N/A     * Gets the numeric type of the property argument
0N/A     * @param props 32 bit property
0N/A     * @return the numeric type
0N/A     */
0N/A    private static int getNumericType(int props)
0N/A    {
0N/A        return (props & NUMERIC_TYPE_MASK_) >> NUMERIC_TYPE_SHIFT_;
0N/A    }
0N/A
0N/A    /**
0N/A     * Gets the property value at the index.
0N/A     * This is optimized.
0N/A     * Note this is alittle different from CharTrie the index m_trieData_
0N/A     * is never negative.
0N/A     * This is a duplicate of UCharacterProperty.getProperty. For optimization
0N/A     * purposes, this method calls the trie data directly instead of through
0N/A     * UCharacterProperty.getProperty.
0N/A     * @param ch code point whose property value is to be retrieved
0N/A     * @return property value of code point
0N/A     * @stable ICU 2.6
0N/A     */
1091N/A    private static final int getProperty(int ch)
0N/A    {
0N/A        if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE
0N/A            || (ch > UTF16.LEAD_SURROGATE_MAX_VALUE
0N/A                && ch < UTF16.SUPPLEMENTARY_MIN_VALUE)) {
1091N/A            // BMP codepoint 0000..D7FF or DC00..FFFF
1091N/A            try { // using try for ch < 0 is faster than using an if statement
1091N/A                return PROPERTY_TRIE_DATA_[
0N/A                              (PROPERTY_TRIE_INDEX_[ch >> 5] << 2)
1091N/A                              + (ch & 0x1f)];
0N/A            } catch (ArrayIndexOutOfBoundsException e) {
0N/A                return PROPERTY_INITIAL_VALUE_;
0N/A            }
0N/A        }
0N/A        if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
1091N/A            // lead surrogate D800..DBFF
1091N/A            return PROPERTY_TRIE_DATA_[
0N/A                              (PROPERTY_TRIE_INDEX_[(0x2800 >> 5) + (ch >> 5)] << 2)
1091N/A                              + (ch & 0x1f)];
0N/A        }
0N/A        // for optimization
0N/A        if (ch <= UTF16.CODEPOINT_MAX_VALUE) {
1091N/A            // supplementary code point 10000..10FFFF
0N/A            // look at the construction of supplementary characters
0N/A            // trail forms the ends of it.
1091N/A            return PROPERTY_.m_trie_.getSurrogateValue(
0N/A                                      UTF16.getLeadSurrogate(ch),
1091N/A                                      (char)(ch & 0x3ff));
0N/A        }
0N/A        // return m_dataOffset_ if there is an error, in this case we return
0N/A        // the default value: m_initialValue_
0N/A        // we cannot assume that m_initialValue_ is at offset 0
0N/A        // this is for optimization.
0N/A        return PROPERTY_INITIAL_VALUE_;
0N/A    }
1091N/A
0N/A}