0N/A/*
2362N/A * Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.
0N/A * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
0N/A *
0N/A * This code is free software; you can redistribute it and/or modify it
0N/A * under the terms of the GNU General Public License version 2 only, as
2362N/A * published by the Free Software Foundation. Oracle designates this
0N/A * particular file as subject to the "Classpath" exception as provided
2362N/A * by Oracle in the LICENSE file that accompanied this code.
0N/A *
0N/A * This code is distributed in the hope that it will be useful, but WITHOUT
0N/A * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
0N/A * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
0N/A * version 2 for more details (a copy is included in the LICENSE file that
0N/A * accompanied this code).
0N/A *
0N/A * You should have received a copy of the GNU General Public License version
0N/A * 2 along with this work; if not, write to the Free Software Foundation,
0N/A * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
0N/A *
2362N/A * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
2362N/A * or visit www.oracle.com if you need additional information or have any
2362N/A * questions.
0N/A */
0N/A/*
0N/A *******************************************************************************
1091N/A * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
0N/A * *
0N/A * The original version of this source code and documentation is copyrighted *
0N/A * and owned by IBM, These materials are provided under terms of a License *
0N/A * Agreement between IBM and Sun. This technology is protected by multiple *
0N/A * US and International patents. This notice and attribution to IBM may not *
0N/A * to removed. *
0N/A *******************************************************************************
0N/A */
0N/A
0N/Apackage sun.text.normalizer;
0N/A
0N/Aimport java.io.BufferedInputStream;
0N/Aimport java.io.InputStream;
0N/Aimport java.io.IOException;
1091N/Aimport java.util.MissingResourceException;
0N/A
0N/A/**
0N/A* <p>Internal class used for Unicode character property database.</p>
0N/A* <p>This classes store binary data read from uprops.icu.
0N/A* It does not have the capability to parse the data into more high-level
0N/A* information. It only returns bytes of information when required.</p>
0N/A* <p>Due to the form most commonly used for retrieval, array of char is used
0N/A* to store the binary data.</p>
0N/A* <p>UCharacterPropertyDB also contains information on accessing indexes to
0N/A* significant points in the binary data.</p>
0N/A* <p>Responsibility for molding the binary data into more meaning form lies on
0N/A* <a href=UCharacter.html>UCharacter</a>.</p>
0N/A* @author Syn Wee Quek
0N/A* @since release 2.1, february 1st 2002
0N/A*/
0N/A
1091N/Apublic final class UCharacterProperty
0N/A{
0N/A // public data members -----------------------------------------------
0N/A
0N/A /**
0N/A * Trie data
0N/A */
0N/A public CharTrie m_trie_;
0N/A /**
0N/A * Optimization
0N/A * CharTrie index array
0N/A */
0N/A public char[] m_trieIndex_;
0N/A /**
0N/A * Optimization
0N/A * CharTrie data array
0N/A */
0N/A public char[] m_trieData_;
0N/A /**
0N/A * Optimization
0N/A * CharTrie data offset
0N/A */
0N/A public int m_trieInitialValue_;
0N/A /**
0N/A * Unicode version
0N/A */
0N/A public VersionInfo m_unicodeVersion_;
1091N/A
1091N/A // uprops.h enum UPropertySource --------------------------------------- ***
0N/A
1091N/A /** From uchar.c/uprops.icu properties vectors trie */
1091N/A public static final int SRC_PROPSVEC=2;
1091N/A /** One more than the highest UPropertySource (SRC_) constant. */
1091N/A public static final int SRC_COUNT=9;
0N/A
0N/A // public methods ----------------------------------------------------
0N/A
0N/A /**
0N/A * Java friends implementation
0N/A */
0N/A public void setIndexData(CharTrie.FriendAgent friendagent)
0N/A {
0N/A m_trieIndex_ = friendagent.getPrivateIndex();
0N/A m_trieData_ = friendagent.getPrivateData();
0N/A m_trieInitialValue_ = friendagent.getPrivateInitialValue();
0N/A }
0N/A
0N/A /**
0N/A * Gets the property value at the index.
0N/A * This is optimized.
0N/A * Note this is alittle different from CharTrie the index m_trieData_
0N/A * is never negative.
0N/A * @param ch code point whose property value is to be retrieved
0N/A * @return property value of code point
0N/A */
1091N/A public final int getProperty(int ch)
0N/A {
0N/A if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE
0N/A || (ch > UTF16.LEAD_SURROGATE_MAX_VALUE
0N/A && ch < UTF16.SUPPLEMENTARY_MIN_VALUE)) {
1091N/A // BMP codepoint 0000..D7FF or DC00..FFFF
0N/A // optimized
1091N/A try { // using try for ch < 0 is faster than using an if statement
1091N/A return m_trieData_[
0N/A (m_trieIndex_[ch >> Trie.INDEX_STAGE_1_SHIFT_]
0N/A << Trie.INDEX_STAGE_2_SHIFT_)
1091N/A + (ch & Trie.INDEX_STAGE_3_MASK_)];
0N/A } catch (ArrayIndexOutOfBoundsException e) {
1091N/A return m_trieInitialValue_;
0N/A }
0N/A }
0N/A if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
1091N/A // lead surrogate D800..DBFF
1091N/A return m_trieData_[
0N/A (m_trieIndex_[Trie.LEAD_INDEX_OFFSET_
0N/A + (ch >> Trie.INDEX_STAGE_1_SHIFT_)]
0N/A << Trie.INDEX_STAGE_2_SHIFT_)
1091N/A + (ch & Trie.INDEX_STAGE_3_MASK_)];
0N/A }
0N/A if (ch <= UTF16.CODEPOINT_MAX_VALUE) {
1091N/A // supplementary code point 10000..10FFFF
0N/A // look at the construction of supplementary characters
0N/A // trail forms the ends of it.
1091N/A return m_trie_.getSurrogateValue(
0N/A UTF16.getLeadSurrogate(ch),
1091N/A (char)(ch & Trie.SURROGATE_MASK_));
0N/A }
1091N/A // ch is out of bounds
0N/A // return m_dataOffset_ if there is an error, in this case we return
0N/A // the default value: m_initialValue_
0N/A // we cannot assume that m_initialValue_ is at offset 0
0N/A // this is for optimization.
1091N/A return m_trieInitialValue_;
0N/A
1091N/A // this all is an inlined form of return m_trie_.getCodePointValue(ch);
0N/A }
0N/A
0N/A /**
1091N/A * Getting the unsigned numeric value of a character embedded in the property
1091N/A * argument
1091N/A * @param prop the character
1091N/A * @return unsigned numberic value
0N/A */
1091N/A public static int getUnsignedValue(int prop)
0N/A {
1091N/A return (prop >> VALUE_SHIFT_) & UNSIGNED_VALUE_MASK_AFTER_SHIFT_;
0N/A }
0N/A
0N/A /**
0N/A * Gets the unicode additional properties.
0N/A * C version getUnicodeProperties.
0N/A * @param codepoint codepoint whose additional properties is to be
0N/A * retrieved
1091N/A * @param column
0N/A * @return unicode properties
0N/A */
1091N/A public int getAdditional(int codepoint, int column) {
1091N/A if (column == -1) {
1091N/A return getProperty(codepoint);
1091N/A }
1091N/A if (column < 0 || column >= m_additionalColumnsCount_) {
1091N/A return 0;
1091N/A }
1091N/A return m_additionalVectors_[
1091N/A m_additionalTrie_.getCodePointValue(codepoint) + column];
0N/A }
0N/A
1091N/A /**
0N/A * <p>Get the "age" of the code point.</p>
0N/A * <p>The "age" is the Unicode version when the code point was first
0N/A * designated (as a non-character or for Private Use) or assigned a
0N/A * character.</p>
0N/A * <p>This can be useful to avoid emitting code points to receiving
0N/A * processes that do not accept newer characters.</p>
0N/A * <p>The data is from the UCD file DerivedAge.txt.</p>
0N/A * <p>This API does not check the validity of the codepoint.</p>
0N/A * @param codepoint The code point.
0N/A * @return the Unicode version number
0N/A */
0N/A public VersionInfo getAge(int codepoint)
0N/A {
1091N/A int version = getAdditional(codepoint, 0) >> AGE_SHIFT_;
0N/A return VersionInfo.getInstance(
0N/A (version >> FIRST_NIBBLE_SHIFT_) & LAST_NIBBLE_MASK_,
0N/A version & LAST_NIBBLE_MASK_, 0, 0);
0N/A }
0N/A
0N/A /**
0N/A * Forms a supplementary code point from the argument character<br>
0N/A * Note this is for internal use hence no checks for the validity of the
0N/A * surrogate characters are done
0N/A * @param lead lead surrogate character
0N/A * @param trail trailing surrogate character
0N/A * @return code point of the supplementary character
0N/A */
0N/A public static int getRawSupplementary(char lead, char trail)
0N/A {
0N/A return (lead << LEAD_SURROGATE_SHIFT_) + trail + SURROGATE_OFFSET_;
0N/A }
0N/A
0N/A /**
0N/A * Loads the property data and initialize the UCharacterProperty instance.
1091N/A * @throws MissingResourceException when data is missing or data has been corrupted
0N/A */
1091N/A public static UCharacterProperty getInstance()
0N/A {
1091N/A if(INSTANCE_ == null) {
0N/A try {
0N/A INSTANCE_ = new UCharacterProperty();
0N/A }
0N/A catch (Exception e) {
1091N/A throw new MissingResourceException(e.getMessage(),"","");
0N/A }
0N/A }
0N/A return INSTANCE_;
0N/A }
0N/A
0N/A /**
0N/A * Checks if the argument c is to be treated as a white space in ICU
0N/A * rules. Usually ICU rule white spaces are ignored unless quoted.
1091N/A * Equivalent to test for Pattern_White_Space Unicode property.
1091N/A * Stable set of characters, won't change.
1091N/A * See UAX #31 Identifier and Pattern Syntax: http://www.unicode.org/reports/tr31/
0N/A * @param c codepoint to check
0N/A * @return true if c is a ICU white space
0N/A */
0N/A public static boolean isRuleWhiteSpace(int c)
0N/A {
0N/A /* "white space" in the sense of ICU rule parsers
0N/A This is a FIXED LIST that is NOT DEPENDENT ON UNICODE PROPERTIES.
1091N/A See UAX #31 Identifier and Pattern Syntax: http://www.unicode.org/reports/tr31/
0N/A U+0009..U+000D, U+0020, U+0085, U+200E..U+200F, and U+2028..U+2029
1091N/A Equivalent to test for Pattern_White_Space Unicode property.
0N/A */
0N/A return (c >= 0x0009 && c <= 0x2029 &&
0N/A (c <= 0x000D || c == 0x0020 || c == 0x0085 ||
0N/A c == 0x200E || c == 0x200F || c >= 0x2028));
0N/A }
0N/A
0N/A // protected variables -----------------------------------------------
0N/A
0N/A /**
0N/A * Extra property trie
0N/A */
0N/A CharTrie m_additionalTrie_;
0N/A /**
0N/A * Extra property vectors, 1st column for age and second for binary
0N/A * properties.
0N/A */
0N/A int m_additionalVectors_[];
0N/A /**
0N/A * Number of additional columns
0N/A */
0N/A int m_additionalColumnsCount_;
0N/A /**
0N/A * Maximum values for block, bits used as in vector word
0N/A * 0
0N/A */
0N/A int m_maxBlockScriptValue_;
0N/A /**
0N/A * Maximum values for script, bits used as in vector word
0N/A * 0
0N/A */
0N/A int m_maxJTGValue_;
0N/A
0N/A // private variables -------------------------------------------------
0N/A
0N/A /**
0N/A * UnicodeData.txt property object
0N/A */
0N/A private static UCharacterProperty INSTANCE_ = null;
0N/A
0N/A /**
0N/A * Default name of the datafile
0N/A */
0N/A private static final String DATA_FILE_NAME_ = "/sun/text/resources/uprops.icu";
0N/A
0N/A /**
0N/A * Default buffer size of datafile
0N/A */
0N/A private static final int DATA_BUFFER_SIZE_ = 25000;
0N/A
0N/A /**
0N/A * Numeric value shift
0N/A */
1091N/A private static final int VALUE_SHIFT_ = 8;
0N/A
0N/A /**
0N/A * Mask to be applied after shifting to obtain an unsigned numeric value
0N/A */
1091N/A private static final int UNSIGNED_VALUE_MASK_AFTER_SHIFT_ = 0xFF;
0N/A
0N/A /**
0N/A * Shift value for lead surrogate to form a supplementary character.
0N/A */
0N/A private static final int LEAD_SURROGATE_SHIFT_ = 10;
0N/A /**
0N/A * Offset to add to combined surrogate pair to avoid msking.
0N/A */
0N/A private static final int SURROGATE_OFFSET_ =
0N/A UTF16.SUPPLEMENTARY_MIN_VALUE -
0N/A (UTF16.SURROGATE_MIN_VALUE <<
0N/A LEAD_SURROGATE_SHIFT_) -
0N/A UTF16.TRAIL_SURROGATE_MIN_VALUE;
0N/A
1091N/A // additional properties ----------------------------------------------
0N/A
0N/A /**
0N/A * First nibble shift
0N/A */
0N/A private static final int FIRST_NIBBLE_SHIFT_ = 0x4;
0N/A /**
0N/A * Second nibble mask
0N/A */
0N/A private static final int LAST_NIBBLE_MASK_ = 0xF;
0N/A /**
0N/A * Age value shift
0N/A */
0N/A private static final int AGE_SHIFT_ = 24;
0N/A
0N/A // private constructors --------------------------------------------------
0N/A
0N/A /**
0N/A * Constructor
1091N/A * @exception IOException thrown when data reading fails or data corrupted
0N/A */
0N/A private UCharacterProperty() throws IOException
0N/A {
0N/A // jar access
0N/A InputStream is = ICUData.getRequiredStream(DATA_FILE_NAME_);
0N/A BufferedInputStream b = new BufferedInputStream(is, DATA_BUFFER_SIZE_);
0N/A UCharacterPropertyReader reader = new UCharacterPropertyReader(b);
0N/A reader.read(this);
0N/A b.close();
0N/A
0N/A m_trie_.putIndexData(this);
0N/A }
0N/A
1091N/A public void upropsvec_addPropertyStarts(UnicodeSet set) {
1091N/A /* add the start code point of each same-value range of the properties vectors trie */
1091N/A if(m_additionalColumnsCount_>0) {
1091N/A /* if m_additionalColumnsCount_==0 then the properties vectors trie may not be there at all */
1091N/A TrieIterator propsVectorsIter = new TrieIterator(m_additionalTrie_);
1091N/A RangeValueIterator.Element propsVectorsResult = new RangeValueIterator.Element();
1091N/A while(propsVectorsIter.next(propsVectorsResult)){
1091N/A set.add(propsVectorsResult.start);
0N/A }
0N/A }
0N/A }
0N/A
0N/A}