0N/A/*
0N/A * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
0N/A *
0N/A * This code is free software; you can redistribute it and/or modify it
0N/A * under the terms of the GNU General Public License version 2 only, as
2362N/A * published by the Free Software Foundation. Oracle designates this
0N/A * particular file as subject to the "Classpath" exception as provided
2362N/A * by Oracle in the LICENSE file that accompanied this code.
0N/A *
0N/A * This code is distributed in the hope that it will be useful, but WITHOUT
0N/A * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
0N/A * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
0N/A * version 2 for more details (a copy is included in the LICENSE file that
0N/A * accompanied this code).
0N/A *
0N/A * You should have received a copy of the GNU General Public License version
0N/A * 2 along with this work; if not, write to the Free Software Foundation,
0N/A * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
0N/A *
2362N/A * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
2362N/A * or visit www.oracle.com if you need additional information or have any
2362N/A * questions.
0N/A */
0N/A/*
0N/A/*
0N/A *******************************************************************************
0N/A * Copyright (C) 2003-2004, International Business Machines Corporation and *
0N/A * others. All Rights Reserved. *
0N/A *******************************************************************************
0N/A */
0N/A//
0N/A// CHANGELOG
0N/A// 2005-05-19 Edward Wang
0N/A// - copy this file from icu4jsrc_3_2/src/com/ibm/icu/text/StringPrep.java
0N/A// - move from package com.ibm.icu.text to package sun.net.idn
0N/A// - use ParseException instead of StringPrepParseException
0N/A// - change 'Normalizer.getUnicodeVersion()' to 'NormalizerImpl.getUnicodeVersion()'
0N/A// - remove all @deprecated tag to make compiler happy
0N/A// 2007-08-14 Martin Buchholz
0N/A// - remove redundant casts
0N/A//
0N/Apackage sun.net.idn;
0N/A
0N/Aimport java.io.BufferedInputStream;
0N/Aimport java.io.ByteArrayInputStream;
0N/Aimport java.io.IOException;
0N/Aimport java.io.InputStream;
0N/Aimport java.text.ParseException;
0N/A
0N/Aimport sun.text.Normalizer;
0N/Aimport sun.text.normalizer.CharTrie;
0N/Aimport sun.text.normalizer.Trie;
0N/Aimport sun.text.normalizer.NormalizerImpl;
0N/Aimport sun.text.normalizer.VersionInfo;
0N/Aimport sun.text.normalizer.UCharacter;
0N/Aimport sun.text.normalizer.UCharacterIterator;
0N/Aimport sun.text.normalizer.UTF16;
0N/Aimport sun.net.idn.UCharacterDirection;
0N/Aimport sun.net.idn.StringPrepDataReader;
0N/A
0N/A/**
0N/A * StringPrep API implements the StingPrep framework as described by
0N/A * <a href="http://www.ietf.org/rfc/rfc3454.txt">RFC 3454</a>.
0N/A * StringPrep prepares Unicode strings for use in network protocols.
0N/A * Profiles of StingPrep are set of rules and data according to which the
0N/A * Unicode Strings are prepared. Each profiles contains tables which describe
0N/A * how a code point should be treated. The tables are broadly classied into
0N/A * <ul>
0N/A * <li> Unassigned Table: Contains code points that are unassigned
0N/A * in the Unicode Version supported by StringPrep. Currently
0N/A * RFC 3454 supports Unicode 3.2. </li>
0N/A * <li> Prohibited Table: Contains code points that are prohibted from
0N/A * the output of the StringPrep processing function. </li>
0N/A * <li> Mapping Table: Contains code ponts that are deleted from the output or case mapped. </li>
0N/A * </ul>
0N/A *
0N/A * The procedure for preparing Unicode strings:
0N/A * <ol>
0N/A * <li> Map: For each character in the input, check if it has a mapping
0N/A * and, if so, replace it with its mapping. </li>
0N/A * <li> Normalize: Possibly normalize the result of step 1 using Unicode
0N/A * normalization. </li>
0N/A * <li> Prohibit: Check for any characters that are not allowed in the
0N/A * output. If any are found, return an error.</li>
0N/A * <li> Check bidi: Possibly check for right-to-left characters, and if
0N/A * any are found, make sure that the whole string satisfies the
0N/A * requirements for bidirectional strings. If the string does not
0N/A * satisfy the requirements for bidirectional strings, return an
0N/A * error. </li>
0N/A * </ol>
0N/A * @author Ram Viswanadha
0N/A * @draft ICU 2.8
0N/A */
0N/Apublic final class StringPrep {
0N/A /**
0N/A * Option to prohibit processing of unassigned code points in the input
0N/A *
0N/A * @see #prepare
0N/A * @draft ICU 2.8
0N/A */
0N/A public static final int DEFAULT = 0x0000;
0N/A
0N/A /**
0N/A * Option to allow processing of unassigned code points in the input
0N/A *
0N/A * @see #prepare
0N/A * @draft ICU 2.8
0N/A */
0N/A public static final int ALLOW_UNASSIGNED = 0x0001;
0N/A
0N/A private static final int UNASSIGNED = 0x0000;
0N/A private static final int MAP = 0x0001;
0N/A private static final int PROHIBITED = 0x0002;
0N/A private static final int DELETE = 0x0003;
0N/A private static final int TYPE_LIMIT = 0x0004;
0N/A
0N/A private static final int NORMALIZATION_ON = 0x0001;
0N/A private static final int CHECK_BIDI_ON = 0x0002;
0N/A
0N/A private static final int TYPE_THRESHOLD = 0xFFF0;
0N/A private static final int MAX_INDEX_VALUE = 0x3FBF; /*16139*/
0N/A private static final int MAX_INDEX_TOP_LENGTH = 0x0003;
0N/A
0N/A /* indexes[] value names */
0N/A private static final int INDEX_TRIE_SIZE = 0; /* number of bytes in normalization trie */
0N/A private static final int INDEX_MAPPING_DATA_SIZE = 1; /* The array that contains the mapping */
0N/A private static final int NORM_CORRECTNS_LAST_UNI_VERSION = 2; /* The index of Unicode version of last entry in NormalizationCorrections.txt */
0N/A private static final int ONE_UCHAR_MAPPING_INDEX_START = 3; /* The starting index of 1 UChar mapping index in the mapping data array */
0N/A private static final int TWO_UCHARS_MAPPING_INDEX_START = 4; /* The starting index of 2 UChars mapping index in the mapping data array */
0N/A private static final int THREE_UCHARS_MAPPING_INDEX_START = 5;
0N/A private static final int FOUR_UCHARS_MAPPING_INDEX_START = 6;
0N/A private static final int OPTIONS = 7; /* Bit set of options to turn on in the profile */
0N/A private static final int INDEX_TOP = 16; /* changing this requires a new formatVersion */
0N/A
0N/A
0N/A /**
0N/A * Default buffer size of datafile
0N/A */
0N/A private static final int DATA_BUFFER_SIZE = 25000;
0N/A
0N/A /* Wrappers for Trie implementations */
0N/A private static final class StringPrepTrieImpl implements Trie.DataManipulate{
0N/A private CharTrie sprepTrie = null;
0N/A /**
0N/A * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's
0N/A * data the index array offset of the indexes for that lead surrogate.
0N/A * @param property data value for a surrogate from the trie, including
0N/A * the folding offset
0N/A * @return data offset or 0 if there is no data for the lead surrogate
0N/A */
0N/A public int getFoldingOffset(int value){
0N/A return value;
0N/A }
0N/A }
0N/A
0N/A // CharTrie implmentation for reading the trie data
0N/A private StringPrepTrieImpl sprepTrieImpl;
0N/A // Indexes read from the data file
0N/A private int[] indexes;
0N/A // mapping data read from the data file
0N/A private char[] mappingData;
0N/A // format version of the data file
0N/A private byte[] formatVersion;
0N/A // the version of Unicode supported by the data file
0N/A private VersionInfo sprepUniVer;
0N/A // the Unicode version of last entry in the
0N/A // NormalizationCorrections.txt file if normalization
0N/A // is turned on
0N/A private VersionInfo normCorrVer;
0N/A // Option to turn on Normalization
0N/A private boolean doNFKC;
0N/A // Option to turn on checking for BiDi rules
0N/A private boolean checkBiDi;
0N/A
0N/A
0N/A private char getCodePointValue(int ch){
0N/A return sprepTrieImpl.sprepTrie.getCodePointValue(ch);
0N/A }
0N/A
0N/A private static VersionInfo getVersionInfo(int comp){
0N/A int micro = comp & 0xFF;
0N/A int milli =(comp >> 8) & 0xFF;
0N/A int minor =(comp >> 16) & 0xFF;
0N/A int major =(comp >> 24) & 0xFF;
0N/A return VersionInfo.getInstance(major,minor,milli,micro);
0N/A }
0N/A private static VersionInfo getVersionInfo(byte[] version){
0N/A if(version.length != 4){
0N/A return null;
0N/A }
0N/A return VersionInfo.getInstance((int)version[0],(int) version[1],(int) version[2],(int) version[3]);
0N/A }
0N/A /**
0N/A * Creates an StringPrep object after reading the input stream.
0N/A * The object does not hold a reference to the input steam, so the stream can be
0N/A * closed after the method returns.
0N/A *
0N/A * @param inputStream The stream for reading the StringPrep profile binarySun
0N/A * @throws IOException
0N/A * @draft ICU 2.8
0N/A */
0N/A public StringPrep(InputStream inputStream) throws IOException{
0N/A
0N/A BufferedInputStream b = new BufferedInputStream(inputStream,DATA_BUFFER_SIZE);
0N/A
0N/A StringPrepDataReader reader = new StringPrepDataReader(b);
0N/A
0N/A // read the indexes
0N/A indexes = reader.readIndexes(INDEX_TOP);
0N/A
0N/A byte[] sprepBytes = new byte[indexes[INDEX_TRIE_SIZE]];
0N/A
0N/A
0N/A //indexes[INDEX_MAPPING_DATA_SIZE] store the size of mappingData in bytes
0N/A mappingData = new char[indexes[INDEX_MAPPING_DATA_SIZE]/2];
0N/A // load the rest of the data data and initialize the data members
0N/A reader.read(sprepBytes,mappingData);
0N/A
0N/A sprepTrieImpl = new StringPrepTrieImpl();
0N/A sprepTrieImpl.sprepTrie = new CharTrie( new ByteArrayInputStream(sprepBytes),sprepTrieImpl );
0N/A
0N/A // get the data format version
0N/A formatVersion = reader.getDataFormatVersion();
0N/A
0N/A // get the options
0N/A doNFKC = ((indexes[OPTIONS] & NORMALIZATION_ON) > 0);
0N/A checkBiDi = ((indexes[OPTIONS] & CHECK_BIDI_ON) > 0);
0N/A sprepUniVer = getVersionInfo(reader.getUnicodeVersion());
0N/A normCorrVer = getVersionInfo(indexes[NORM_CORRECTNS_LAST_UNI_VERSION]);
0N/A VersionInfo normUniVer = NormalizerImpl.getUnicodeVersion();
0N/A if(normUniVer.compareTo(sprepUniVer) < 0 && /* the Unicode version of SPREP file must be less than the Unicode Vesion of the normalization data */
0N/A normUniVer.compareTo(normCorrVer) < 0 && /* the Unicode version of the NormalizationCorrections.txt file should be less than the Unicode Vesion of the normalization data */
0N/A ((indexes[OPTIONS] & NORMALIZATION_ON) > 0) /* normalization turned on*/
0N/A ){
0N/A throw new IOException("Normalization Correction version not supported");
0N/A }
0N/A b.close();
0N/A }
0N/A
0N/A private static final class Values{
0N/A boolean isIndex;
0N/A int value;
0N/A int type;
0N/A public void reset(){
0N/A isIndex = false;
0N/A value = 0;
0N/A type = -1;
0N/A }
0N/A }
0N/A
0N/A private static final void getValues(char trieWord,Values values){
0N/A values.reset();
0N/A if(trieWord == 0){
0N/A /*
0N/A * Initial value stored in the mapping table
0N/A * just return TYPE_LIMIT .. so that
0N/A * the source codepoint is copied to the destination
0N/A */
0N/A values.type = TYPE_LIMIT;
0N/A }else if(trieWord >= TYPE_THRESHOLD){
0N/A values.type = (trieWord - TYPE_THRESHOLD);
0N/A }else{
0N/A /* get the type */
0N/A values.type = MAP;
0N/A /* ascertain if the value is index or delta */
0N/A if((trieWord & 0x02)>0){
0N/A values.isIndex = true;
0N/A values.value = trieWord >> 2; //mask off the lower 2 bits and shift
0N/A
0N/A }else{
0N/A values.isIndex = false;
0N/A values.value = (trieWord<<16)>>16;
0N/A values.value = (values.value >> 2);
0N/A
0N/A }
0N/A
0N/A if((trieWord>>2) == MAX_INDEX_VALUE){
0N/A values.type = DELETE;
0N/A values.isIndex = false;
0N/A values.value = 0;
0N/A }
0N/A }
0N/A }
0N/A
0N/A
0N/A
0N/A private StringBuffer map( UCharacterIterator iter, int options)
0N/A throws ParseException {
0N/A
0N/A Values val = new Values();
0N/A char result = 0;
0N/A int ch = UCharacterIterator.DONE;
0N/A StringBuffer dest = new StringBuffer();
0N/A boolean allowUnassigned = ((options & ALLOW_UNASSIGNED)>0);
0N/A
0N/A while((ch=iter.nextCodePoint())!= UCharacterIterator.DONE){
0N/A
0N/A result = getCodePointValue(ch);
0N/A getValues(result,val);
0N/A
0N/A // check if the source codepoint is unassigned
0N/A if(val.type == UNASSIGNED && allowUnassigned == false){
0N/A throw new ParseException("An unassigned code point was found in the input " +
0N/A iter.getText(), iter.getIndex());
0N/A }else if((val.type == MAP)){
0N/A int index, length;
0N/A
0N/A if(val.isIndex){
0N/A index = val.value;
0N/A if(index >= indexes[ONE_UCHAR_MAPPING_INDEX_START] &&
0N/A index < indexes[TWO_UCHARS_MAPPING_INDEX_START]){
0N/A length = 1;
0N/A }else if(index >= indexes[TWO_UCHARS_MAPPING_INDEX_START] &&
0N/A index < indexes[THREE_UCHARS_MAPPING_INDEX_START]){
0N/A length = 2;
0N/A }else if(index >= indexes[THREE_UCHARS_MAPPING_INDEX_START] &&
0N/A index < indexes[FOUR_UCHARS_MAPPING_INDEX_START]){
0N/A length = 3;
0N/A }else{
0N/A length = mappingData[index++];
0N/A }
0N/A /* copy mapping to destination */
0N/A dest.append(mappingData,index,length);
0N/A continue;
0N/A
0N/A }else{
0N/A ch -= val.value;
0N/A }
0N/A }else if(val.type == DELETE){
0N/A // just consume the codepoint and contine
0N/A continue;
0N/A }
0N/A //copy the source into destination
0N/A UTF16.append(dest,ch);
0N/A }
0N/A
0N/A return dest;
0N/A }
0N/A
0N/A
0N/A private StringBuffer normalize(StringBuffer src){
0N/A /*
0N/A * Option UNORM_BEFORE_PRI_29:
0N/A *
0N/A * IDNA as interpreted by IETF members (see unicode mailing list 2004H1)
0N/A * requires strict adherence to Unicode 3.2 normalization,
0N/A * including buggy composition from before fixing Public Review Issue #29.
0N/A * Note that this results in some valid but nonsensical text to be
0N/A * either corrupted or rejected, depending on the text.
0N/A * See http://www.unicode.org/review/resolved-pri.html#pri29
0N/A * See unorm.cpp and cnormtst.c
0N/A */
0N/A return new StringBuffer(
0N/A Normalizer.normalize(
0N/A src.toString(),
0N/A java.text.Normalizer.Form.NFKC,
0N/A Normalizer.UNICODE_3_2|NormalizerImpl.BEFORE_PRI_29));
0N/A }
0N/A /*
0N/A boolean isLabelSeparator(int ch){
0N/A int result = getCodePointValue(ch);
0N/A if( (result & 0x07) == LABEL_SEPARATOR){
0N/A return true;
0N/A }
0N/A return false;
0N/A }
0N/A */
0N/A /*
0N/A 1) Map -- For each character in the input, check if it has a mapping
0N/A and, if so, replace it with its mapping.
0N/A
0N/A 2) Normalize -- Possibly normalize the result of step 1 using Unicode
0N/A normalization.
0N/A
0N/A 3) Prohibit -- Check for any characters that are not allowed in the
0N/A output. If any are found, return an error.
0N/A
0N/A 4) Check bidi -- Possibly check for right-to-left characters, and if
0N/A any are found, make sure that the whole string satisfies the
0N/A requirements for bidirectional strings. If the string does not
0N/A satisfy the requirements for bidirectional strings, return an
0N/A error.
0N/A [Unicode3.2] defines several bidirectional categories; each character
0N/A has one bidirectional category assigned to it. For the purposes of
0N/A the requirements below, an "RandALCat character" is a character that
0N/A has Unicode bidirectional categories "R" or "AL"; an "LCat character"
0N/A is a character that has Unicode bidirectional category "L". Note
0N/A
0N/A
0N/A that there are many characters which fall in neither of the above
0N/A definitions; Latin digits (<U+0030> through <U+0039>) are examples of
0N/A this because they have bidirectional category "EN".
0N/A
0N/A In any profile that specifies bidirectional character handling, all
0N/A three of the following requirements MUST be met:
0N/A
0N/A 1) The characters in section 5.8 MUST be prohibited.
0N/A
0N/A 2) If a string contains any RandALCat character, the string MUST NOT
0N/A contain any LCat character.
0N/A
0N/A 3) If a string contains any RandALCat character, a RandALCat
0N/A character MUST be the first character of the string, and a
0N/A RandALCat character MUST be the last character of the string.
0N/A */
0N/A /**
0N/A * Prepare the input buffer for use in applications with the given profile. This operation maps, normalizes(NFKC),
0N/A * checks for prohited and BiDi characters in the order defined by RFC 3454
0N/A * depending on the options specified in the profile.
0N/A *
0N/A * @param src A UCharacterIterator object containing the source string
0N/A * @param options A bit set of options:
0N/A *
0N/A * - StringPrep.NONE Prohibit processing of unassigned code points in the input
0N/A *
0N/A * - StringPrep.ALLOW_UNASSIGNED Treat the unassigned code points are in the input
0N/A * as normal Unicode code points.
0N/A *
0N/A * @return StringBuffer A StringBuffer containing the output
0N/A * @throws ParseException
0N/A * @draft ICU 2.8
0N/A */
0N/A public StringBuffer prepare(UCharacterIterator src, int options)
0N/A throws ParseException{
0N/A
0N/A // map
0N/A StringBuffer mapOut = map(src,options);
0N/A StringBuffer normOut = mapOut;// initialize
0N/A
0N/A if(doNFKC){
0N/A // normalize
0N/A normOut = normalize(mapOut);
0N/A }
0N/A
0N/A int ch;
0N/A char result;
0N/A UCharacterIterator iter = UCharacterIterator.getInstance(normOut);
0N/A Values val = new Values();
0N/A int direction=UCharacterDirection.CHAR_DIRECTION_COUNT,
0N/A firstCharDir=UCharacterDirection.CHAR_DIRECTION_COUNT;
0N/A int rtlPos=-1, ltrPos=-1;
0N/A boolean rightToLeft=false, leftToRight=false;
0N/A
0N/A while((ch=iter.nextCodePoint())!= UCharacterIterator.DONE){
0N/A result = getCodePointValue(ch);
0N/A getValues(result,val);
0N/A
0N/A if(val.type == PROHIBITED ){
0N/A throw new ParseException("A prohibited code point was found in the input" +
0N/A iter.getText(), val.value);
0N/A }
0N/A
0N/A direction = UCharacter.getDirection(ch);
0N/A if(firstCharDir == UCharacterDirection.CHAR_DIRECTION_COUNT){
0N/A firstCharDir = direction;
0N/A }
0N/A if(direction == UCharacterDirection.LEFT_TO_RIGHT){
0N/A leftToRight = true;
0N/A ltrPos = iter.getIndex()-1;
0N/A }
0N/A if(direction == UCharacterDirection.RIGHT_TO_LEFT || direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC){
0N/A rightToLeft = true;
0N/A rtlPos = iter.getIndex()-1;
0N/A }
0N/A }
0N/A if(checkBiDi == true){
0N/A // satisfy 2
0N/A if( leftToRight == true && rightToLeft == true){
0N/A throw new ParseException("The input does not conform to the rules for BiDi code points." +
0N/A iter.getText(),
0N/A (rtlPos>ltrPos) ? rtlPos : ltrPos);
0N/A }
0N/A
0N/A //satisfy 3
0N/A if( rightToLeft == true &&
0N/A !((firstCharDir == UCharacterDirection.RIGHT_TO_LEFT || firstCharDir == UCharacterDirection.RIGHT_TO_LEFT_ARABIC) &&
0N/A (direction == UCharacterDirection.RIGHT_TO_LEFT || direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC))
0N/A ){
0N/A throw new ParseException("The input does not conform to the rules for BiDi code points." +
0N/A iter.getText(),
0N/A (rtlPos>ltrPos) ? rtlPos : ltrPos);
0N/A }
0N/A }
0N/A return normOut;
0N/A
0N/A }
0N/A}