/*
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
* *
* The original version of this source code and documentation is copyrighted *
* and owned by IBM, These materials are provided under terms of a License *
* Agreement between IBM and Sun. This technology is protected by multiple *
* US and International patents. This notice and attribution to IBM may not *
* to removed. *
*******************************************************************************
*/
/**
* <p>Internal class used for Unicode character property database.</p>
* <p>This class stores binary data read from uprops.icu.
* It does not have the capability to parse the data into more high-level
* information. It only returns bytes of information when required.</p>
* <p>Due to the form most commonly used for retrieval, array of char is used
* to store the binary data.</p>
* <p>UCharacterPropertyDB also contains information on accessing indexes to
* significant points in the binary data.</p>
* <p>Responsibility for molding the binary data into a more meaningful form lies on
* <a href=UCharacter.html>UCharacter</a>.</p>
* @author Syn Wee Quek
* @since release 2.1, february 1st 2002
*/
public final class UCharacterProperty
{
// public data members -----------------------------------------------
/**
* Trie data
*/
/**
* Optimization
* CharTrie index array
*/
public char[] m_trieIndex_;
/**
* Optimization
* CharTrie data array
*/
public char[] m_trieData_;
/**
* Optimization
* CharTrie initial value; returned as the default when a lookup cannot be
* resolved from the trie data
*/
public int m_trieInitialValue_;
/**
* Unicode version
*/
// uprops.h enum UPropertySource --------------------------------------- ***
/** From uchar.c/uprops.icu properties vectors trie */
/** One more than the highest UPropertySource (SRC_) constant. */
// public methods ----------------------------------------------------
/**
* Java friends implementation
*/
{
}
/**
* Gets the property value at the index.
* This is optimized.
* Note this is a little different from CharTrie: the index m_trieData_
* is never negative.
* @param ch code point whose property value is to be retrieved
* @return property value of code point
*/
{
// BMP codepoint 0000..D7FF or DC00..FFFF
// optimized
try { // using try for ch < 0 is faster than using an if statement
return m_trieData_[
} catch (ArrayIndexOutOfBoundsException e) {
return m_trieInitialValue_;
}
}
// lead surrogate D800..DBFF
return m_trieData_[
}
// supplementary code point 10000..10FFFF
// look at the construction of supplementary characters
// trail forms the ends of it.
return m_trie_.getSurrogateValue(
}
// ch is out of bounds
// return m_dataOffset_ if there is an error, in this case we return
// the default value: m_initialValue_
// we cannot assume that m_initialValue_ is at offset 0
// this is for optimization.
return m_trieInitialValue_;
// this all is an inlined form of return m_trie_.getCodePointValue(ch);
}
/**
* Getting the unsigned numeric value of a character embedded in the property
* argument
* @param prop the character
* @return unsigned numeric value
*/
{
}
/**
* Gets the unicode additional properties.
* C version getUnicodeProperties.
* @param codepoint codepoint whose additional properties is to be
* retrieved
* @param column
* @return unicode properties
*/
if (column == -1) {
return getProperty(codepoint);
}
return 0;
}
return m_additionalVectors_[
}
/**
* <p>Get the "age" of the code point.</p>
* <p>The "age" is the Unicode version when the code point was first
* designated (as a non-character or for Private Use) or assigned a
* character.</p>
* <p>This can be useful to avoid emitting code points to receiving
* processes that do not accept newer characters.</p>
* <p>The data is from the UCD file DerivedAge.txt.</p>
* <p>This API does not check the validity of the codepoint.</p>
* @param codepoint The code point.
* @return the Unicode version number
*/
{
return VersionInfo.getInstance(
}
/**
* Forms a supplementary code point from the argument character<br>
* Note this is for internal use hence no checks for the validity of the
* surrogate characters are done
* @param lead lead surrogate character
* @param trail trailing surrogate character
* @return code point of the supplementary character
*/
{
}
/**
* Loads the property data and initialize the UCharacterProperty instance.
* @throws MissingResourceException when data is missing or data has been corrupted
*/
{
try {
INSTANCE_ = new UCharacterProperty();
}
catch (Exception e) {
}
}
return INSTANCE_;
}
/**
* Checks if the argument c is to be treated as a white space in ICU
* rules. Usually ICU rule white spaces are ignored unless quoted.
* Equivalent to test for Pattern_White_Space Unicode property.
* Stable set of characters, won't change.
* See UAX #31 Identifier and Pattern Syntax: http://www.unicode.org/reports/tr31/
* @param c codepoint to check
* @return true if c is an ICU white space
*/
public static boolean isRuleWhiteSpace(int c)
{
    // "White space" as understood by ICU rule parsers. This is a FIXED LIST
    // that does NOT depend on Unicode property data; it is equivalent to the
    // Pattern_White_Space property defined by UAX #31
    // (http://www.unicode.org/reports/tr31/):
    //   U+0009..U+000D, U+0020, U+0085, U+200E..U+200F, U+2028..U+2029

    // Anything outside the overall span of the list can be rejected at once.
    if (c < 0x0009 || c > 0x2029) {
        return false;
    }
    // U+0009..U+000D: the only multi-element run at the low end of the list.
    if (c <= 0x000D) {
        return true;
    }
    // The remaining members are isolated code points.
    switch (c) {
        case 0x0020:
        case 0x0085:
        case 0x200E:
        case 0x200F:
        case 0x2028:
        case 0x2029:
            return true;
        default:
            return false;
    }
}
// protected variables -----------------------------------------------
/**
* Extra property trie
*/
/**
* Extra property vectors, 1st column for age and second for binary
* properties.
*/
int m_additionalVectors_[];
/**
* Number of additional columns
*/
/**
* Maximum values for block, bits used as in vector word
* 0
*/
/**
* Maximum values for script, bits used as in vector word
* 0
*/
int m_maxJTGValue_;
// private variables -------------------------------------------------
/**
* UnicodeData.txt property object
*/
/**
* Default name of the datafile
*/
/**
* Default buffer size of datafile
*/
/**
* Numeric value shift
*/
/**
* Mask to be applied after shifting to obtain an unsigned numeric value
*/
/**
* Shift value for lead surrogate to form a supplementary character.
*/
/**
* Offset to add to combined surrogate pair to avoid masking.
*/
private static final int SURROGATE_OFFSET_ =
// additional properties ----------------------------------------------
/**
* First nibble shift
*/
/**
* Second nibble mask
*/
/**
* Age value shift
*/
// private constructors --------------------------------------------------
/**
* Constructor
* @exception IOException thrown when data reading fails or data corrupted
*/
{
// jar access
b.close();
m_trie_.putIndexData(this);
}
/* add the start code point of each same-value range of the properties vectors trie */
if(m_additionalColumnsCount_>0) {
/* if m_additionalColumnsCount_==0 then the properties vectors trie may not be there at all */
}
}
}
}