/* * Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. Oracle designates this * particular file as subject to the "Classpath" exception as provided * by Oracle in the LICENSE file that accompanied this code. * * This code is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * version 2 for more details (a copy is included in the LICENSE file that * accompanied this code). * * You should have received a copy of the GNU General Public License version * 2 along with this work; if not, write to the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. * * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA * or visit www.oracle.com if you need additional information or have any * questions. */ /* ******************************************************************************* * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved * * * * The original version of this source code and documentation is copyrighted * * and owned by IBM, These materials are provided under terms of a License * * Agreement between IBM and Sun. This technology is protected by multiple * * US and International patents. This notice and attribution to IBM may not * * to removed. * ******************************************************************************* */ package sun.text.normalizer; import java.io.BufferedInputStream; import java.io.InputStream; import java.io.IOException; import java.util.MissingResourceException; /** *
Internal class used for Unicode character property database.
* This class stores binary data read from uprops.icu.
 * It does not have the capability to parse the data into higher-level
 * information. It only returns bytes of information when required.
* Due to the form most commonly used for retrieval, an array of char is used
 * to store the binary data.
*UCharacterPropertyDB also contains information on accessing indexes to * significant points in the binary data.
* Responsibility for molding the binary data into a more meaningful form lies
 * with UCharacter.
* @author Syn Wee Quek * @since release 2.1, february 1st 2002 */ public final class UCharacterProperty { // public data members ----------------------------------------------- /** * Trie data */ public CharTrie m_trie_; /** * Optimization * CharTrie index array */ public char[] m_trieIndex_; /** * Optimization * CharTrie data array */ public char[] m_trieData_; /** * Optimization * CharTrie data offset */ public int m_trieInitialValue_; /** * Unicode version */ public VersionInfo m_unicodeVersion_; // uprops.h enum UPropertySource --------------------------------------- *** /** From uchar.c/uprops.icu properties vectors trie */ public static final int SRC_PROPSVEC=2; /** One more than the highest UPropertySource (SRC_) constant. */ public static final int SRC_COUNT=9; // public methods ---------------------------------------------------- /** * Java friends implementation */ public void setIndexData(CharTrie.FriendAgent friendagent) { m_trieIndex_ = friendagent.getPrivateIndex(); m_trieData_ = friendagent.getPrivateData(); m_trieInitialValue_ = friendagent.getPrivateInitialValue(); } /** * Gets the property value at the index. * This is optimized. * Note this is alittle different from CharTrie the index m_trieData_ * is never negative. 
* @param ch code point whose property value is to be retrieved * @return property value of code point */ public final int getProperty(int ch) { if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE || (ch > UTF16.LEAD_SURROGATE_MAX_VALUE && ch < UTF16.SUPPLEMENTARY_MIN_VALUE)) { // BMP codepoint 0000..D7FF or DC00..FFFF // optimized try { // using try for ch < 0 is faster than using an if statement return m_trieData_[ (m_trieIndex_[ch >> Trie.INDEX_STAGE_1_SHIFT_] << Trie.INDEX_STAGE_2_SHIFT_) + (ch & Trie.INDEX_STAGE_3_MASK_)]; } catch (ArrayIndexOutOfBoundsException e) { return m_trieInitialValue_; } } if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) { // lead surrogate D800..DBFF return m_trieData_[ (m_trieIndex_[Trie.LEAD_INDEX_OFFSET_ + (ch >> Trie.INDEX_STAGE_1_SHIFT_)] << Trie.INDEX_STAGE_2_SHIFT_) + (ch & Trie.INDEX_STAGE_3_MASK_)]; } if (ch <= UTF16.CODEPOINT_MAX_VALUE) { // supplementary code point 10000..10FFFF // look at the construction of supplementary characters // trail forms the ends of it. return m_trie_.getSurrogateValue( UTF16.getLeadSurrogate(ch), (char)(ch & Trie.SURROGATE_MASK_)); } // ch is out of bounds // return m_dataOffset_ if there is an error, in this case we return // the default value: m_initialValue_ // we cannot assume that m_initialValue_ is at offset 0 // this is for optimization. return m_trieInitialValue_; // this all is an inlined form of return m_trie_.getCodePointValue(ch); } /** * Getting the unsigned numeric value of a character embedded in the property * argument * @param prop the character * @return unsigned numberic value */ public static int getUnsignedValue(int prop) { return (prop >> VALUE_SHIFT_) & UNSIGNED_VALUE_MASK_AFTER_SHIFT_; } /** * Gets the unicode additional properties. * C version getUnicodeProperties. 
* @param codepoint codepoint whose additional properties are to be
     *        retrieved
     * @param column index of the additional-properties column to read;
     *        -1 selects the value returned by getProperty(codepoint)
     * @return unicode properties; 0 when column is any other negative
     *         value or is not less than m_additionalColumnsCount_
     */
    public int getAdditional(int codepoint, int column)
    {
        if (column == -1) {
            return getProperty(codepoint);
        }

        final boolean validColumn =
            column >= 0 && column < m_additionalColumnsCount_;
        if (!validColumn) {
            return 0;
        }

        // the trie value is used as the start offset of this code point's
        // entry in the vectors array; the column is added to that offset
        int rowStart = m_additionalTrie_.getCodePointValue(codepoint);
        return m_additionalVectors_[rowStart + column];
    }

    /**
     * Get the "age" of the code point.
*The "age" is the Unicode version when the code point was first * designated (as a non-character or for Private Use) or assigned a * character.
*This can be useful to avoid emitting code points to receiving * processes that do not accept newer characters.
*The data is from the UCD file DerivedAge.txt.
*This API does not check the validity of the codepoint.
* @param codepoint The code point.
     * @return the Unicode version number, built from column 0 of the
     *         additional properties shifted right by AGE_SHIFT_; the
     *         last two version fields are always 0
     */
    public VersionInfo getAge(int codepoint)
    {
        int version = getAdditional(codepoint, 0) >> AGE_SHIFT_;

        // the first version field is taken after a further shift by
        // FIRST_NIBBLE_SHIFT_, the second directly; both are masked
        // with LAST_NIBBLE_MASK_
        int major = (version >> FIRST_NIBBLE_SHIFT_) & LAST_NIBBLE_MASK_;
        int minor = version & LAST_NIBBLE_MASK_;

        return VersionInfo.getInstance(major, minor, 0, 0);
    }

    /**
     * Forms a supplementary code point from the argument character