/*
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
* *
* The original version of this source code and documentation is copyrighted *
* and owned by IBM, These materials are provided under terms of a License *
* Agreement between IBM and Sun. This technology is protected by multiple *
* US and International patents. This notice and attribution to IBM may not *
* to removed. *
*******************************************************************************
*/
/**
* @author Ram Viswanadha
*/
public final class NormalizerImpl {
// Static block for the class to initialize its own self:
// creates the shared IMPL instance when the class is first initialized.
static
{
    try
    {
        IMPL = new NormalizerImpl();
    }
    catch (Exception e)
    {
        // Keep the original exception as the cause so that the stack trace
        // of the underlying failure is not lost; the message text is the
        // same as before.
        throw new RuntimeException(e.getMessage(), e);
    }
}
/*
* This new implementation of the normalization code loads its data from
* unorm.icu, which is generated with the gennorm tool.
* The format of that file is described at the end of this file.
*/
// norm32 value constants
// quick check flags 0..3 set mean "no" for their forms
/* quick check flags 4..5 mean "maybe" for their forms;
* test flags>=QC_MAYBE
*/
// UnicodeData.txt combining class in bits 15..8
// 16 bits for the index to UChars and other extra data
/* norm32 value constants using >16 bits */
// private static final long MIN_JAMO_V = (long)(0xfff20000 & UNSIGNED_INT_MASK);
/* indexes[] value names */
/* number of bytes in normalization trie */
/* number of chars in extra data */
/* number of uint16_t words for combining data */
/* number of bytes in FCD trie */
/* number of bytes in the auxiliary trie */
/* changing this requires a new formatVersion */
/* AUX constants */
/* value constants for auxTrie */
/*******************************/
/* Wrappers for Trie implementations */
/**
* Called by com.ibm.icu.util.Trie to extract from a lead surrogate's
* data the index array offset of the indexes for that lead surrogate.
* @param property data value for a surrogate from the trie, including
* the folding offset
* @return data offset or 0 if there is no data for the lead surrogate
*/
/* normTrie: 32-bit trie result may contain a special extraData index with the folding offset */
return BMP_INDEX_LENGTH+
(0x3ff<<SURROGATE_BLOCK_BITS));
}
}
/**
* Called by com.ibm.icu.util.Trie to extract from a lead surrogate's
* data the index array offset of the indexes for that lead surrogate.
* @param property data value for a surrogate from the trie, including
* the folding offset
* @return data offset or 0 if there is no data for the lead surrogate
*/
/* fcdTrie: the folding offset is the lead FCD value itself */
return value;
}
}
/**
* Called by com.ibm.icu.util.Trie to extract from a lead surrogate's
* data the index array offset of the indexes for that lead surrogate.
* @param property data value for a surrogate from the trie, including
* the folding offset
* @return data offset or 0 if there is no data for the lead surrogate
*/
/* auxTrie: the folding offset is in bits 9..0 of the 16-bit trie result */
}
}
/****************************************************/
// Static normalization data. The constructor fills these in only while
// isDataLoaded is still false, so the data is loaded at most once.
// NOTE(review): presumably the index values read from the data file
// (the constructor has a "read the indexes" step) - confirm
private static int[] indexes;
// allocated by the constructor with combiningTableTop entries
private static char[] combiningTable;
// allocated by the constructor with extraDataTop entries; the
// variable-length decomposition data is read from this array
private static char[] extraData;
// set to true by the constructor once loading finished without an exception
private static boolean isDataLoaded;
// data format version flags; isFormatVersion_2_1 gates the auxiliary-trie
// based queries, isFormatVersion_2_2 gates the skippable "(f)" test
private static boolean isFormatVersion_2_1;
private static boolean isFormatVersion_2_2;
// NOTE(review): presumably the Unicode version of the loaded data - the
// assignment is not visible here; confirm
private static byte[] unicodeVersion;
/**
* Default buffer size of datafile
*/
/**
* FCD check: everything below this code point is known to have a 0
* lead combining class
*/
/**
* Bit 7 of the length byte for a decomposition string in extra data is
* a flag indicating whether the decomposition string is
* preceded by a 16-bit word with the leading and trailing cc
* of the decomposition (like for A-umlaut);
* if not, then both cc's are zero (like for compatibility ideographs).
*/
/**
* Bits 6..0 of the length byte contain the actual length.
*/
/** Length of the BMP portion of the index (stage 1) array. */
/** Number of bits of a trail surrogate that are used in index table
* lookups.
*/
// public utility
}
// protected constructor ---------------------------------------------
/**
* Constructor
* @exception thrown when data reading fails or data corrupted
*/
//data should be loaded only once
if(!isDataLoaded){
// jar access
// read the indexes
combiningTable = new char[combiningTableTop];
extraData = new char[extraDataTop];
fcdTrieImpl = new FCDTrieImpl();
normTrieImpl = new NormTrieImpl();
auxTrieImpl = new AuxTrieImpl();
// load the rest of the data and initialize the data members
// we reached here without any exceptions, so the data is fully
// loaded; set the variable to true
isDataLoaded = true;
// get the data format version
||
);
||
);
b.close();
}
}
/* ---------------------------------------------------------------------- */
/* Korean Hangul and Jamo constants */
/**
* Hangul helper: callers in this file use the result to tell Hangul LV
* syllables (no trailing Jamo T) apart from LVT syllables.
* NOTE(review): only the rebasing of c is present here; the actual test and
* the return statement are missing from this method - confirm against the
* upstream source before relying on it.
* @param c code unit to test
*/
private static boolean isHangulWithoutJamoT(char c) {
// rebase c relative to HANGUL_BASE
c-=HANGUL_BASE;
}
/* norm32 helpers */
/* is this a norm32 with a regular index? */
return norm32<MIN_SPECIAL;
}
/* is this a norm32 with a special index for a lead surrogate? */
}
/* is this a norm32 with a special index for a Hangul syllable or a Jamo? */
return norm32>=MIN_HANGUL;
}
/*
* Given norm32 for Jamo V or T,
* is this a Jamo V?
*/
return norm32<JAMO_V_TOP;
}
/* data access primitives ----------------------------------------------- */
}
char c2) {
/*
* the surrogate index in norm32 stores only the number of the surrogate
* index block; see gennorm/store.c/getFoldedNormValue()
*/
return ((UNSIGNED_INT_MASK) &
}
///CLOVER:OFF
/**
* NOTE(review): presumably returns the norm32 value for code point c; the
* method body is not present here, so this cannot be confirmed - verify
* against the upstream source.
* @param c code point
*/
private static long getNorm32(int c){
}
/*
* get a norm32 from text with complete code points
* (like from decompositions)
*/
int/*unsigned*/ mask) {
/* *p is a lead surrogate, get the real norm32 */
}
return norm32;
}
//// for StringPrep
}
/**
* NOTE(review): presumably returns the FCD data ("fcd16") for a single
* code unit; the method body is not present here - verify against the
* upstream source.
* @param c code unit
*/
public static char getFCD16(char c) {
}
/* the surrogate index in fcd16 is an absolute offset over the
* start of stage 1
* */
}
/**
* NOTE(review): presumably the code point variant of getFCD16(char); the
* method body is not present here - verify against the upstream source.
* @param c code point
*/
public static int getFCD16(int c) {
}
return (int)(norm32>>EXTRA_SHIFT);
}
/**
* Mutable holder for values passed to/from the decomposition helpers.
* NOTE(review): presumably used as an out-parameter by the decompose()
* helpers that follow - confirm against the callers.
*/
private static final class DecomposeArgs{
// NOTE(review): presumably the length of the decomposition in code units - confirm
int length;
}
/**
*
* get the canonical or compatibility decomposition for one character
*
* @return index into the extraData array
*/
int/*unsigned*/ qcMask,
int p= getExtraDataIndex(norm32);
/* use compatibility decomposition, skip canonical data */
}
/* get the lead and trail cc's */
} else {
/* lead and trail cc's are both 0 */
}
return p;
}
/**
* get the canonical decomposition for one character
* @return index into the extraData array
*/
int p= getExtraDataIndex(norm32);
/* get the lead and trail cc's */
} else {
/* lead and trail cc's are both 0 */
}
return p;
}
/**
* Argument/result holder for the helper that reads the next character
* (c, c2) from source[next++] and gets its combining class.
*/
private static final class NextCCArgs{
// text being read
char[] source;
// index of the next code unit to read; must be < limit before the call
// and is <= limit afterwards
int next;
// end index of the text to read
int limit;
// first (or only) code unit of the character that was read
char c;
// second code unit of the character; 0 if only one code unit was used
char c2;
}
/*
* get the combining class of (c, c2)= args.source[args.next++]
* before: args.next<args.limit after: args.next<=args.limit
* if only one code unit is used, then c2==0
*/
long /*unsigned*/ norm32;
return 0;
} else {
if(!isNorm32LeadSurrogate(norm32)) {
} else {
/* c is a lead surrogate, get the real norm32 */
} else {
return 0;
}
}
}
}
/**
* Argument/result holder for the helpers that read a character backwards
* from a char array (the "read backwards and get norm32" helper and the
* backwards combining-class lookup).
*/
private static final class PrevArgs{
// text being read
char[] src;
// lower bound of the text: start < current before reading,
// start <= current afterwards
int start;
// current read position; decremented as code units are consumed
int current;
// code unit that was read (first one encountered going backwards)
char c;
// if non-zero, (c2, c) is a surrogate pair: c2 is the first surrogate
// even though it is read second
char c2;
}
/*
* read backwards and get norm32
* return 0 if the character is <minC
* if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first
* surrogate but read second!)
*/
int/*unsigned*/ minC,
int/*unsigned*/ mask) {
long/*unsigned*/ norm32;
/* check for a surrogate before getting norm32 to see if we need to
* predecrement further
*/
return 0;
/* unpaired first surrogate */
return 0;
/* all surrogate pairs with this lead surrogate have
* only irrelevant data
*/
return 0;
} else {
/* norm32 must be a surrogate special */
}
} else {
/* unpaired second surrogate */
return 0;
}
}
/*
* get the combining class of (c, c2)=*--p
* before: start<p after: start<=p
*/
}
/*
* is this a safe boundary character for NF*D?
* (lead cc==0)
*/
int/*unsigned*/ccOrQCMask,
int/*unsigned*/ decompQCMask) {
return true; /* cc==0 and no decomposition: this is NF*D safe */
}
/* inspect its decomposition - maybe a Hangul but not a surrogate here*/
/* decomposes, get everything from the variable-length extra data */
} else {
/* no decomposition (or Hangul), test the cc directly */
}
}
/*
* is this (or does its decomposition begin with) a "true starter"?
* (cc==0 and NF*C_YES)
*/
int/*unsigned*/ ccOrQCMask,
int/*unsigned*/ decompQCMask) {
return true; /* this is a true starter (could be Hangul or Jamo L)*/
}
/* inspect its decomposition - not a Hangul or a surrogate here */
int p; /* index into extra data array */
/* decomposes, get everything from the variable-length extra data */
/* does it begin with NFC_YES? */
/* yes, the decomposition begins with a true starter */
return true;
}
}
}
return false;
}
/* reorder UTF-16 in-place ---------------------------------------------- */
/**
* simpler, single-character version of mergeOrdered() -
* bubble-insert one single code point into the preceding string
* which is already canonically ordered
* (c, c2) may or may not yet have been inserted at src[current]..src[p]
*
* it must be p=current+lengthof(c, c2) i.e. p=current+(c2==0 ? 1 : 2)
*
* before: src[start]..src[current] is already ordered, and
* src[current]..src[p] may or may not hold (c, c2) but
* must be exactly the same length as (c, c2)
* after: src[start]..src[p] is ordered
*
* @return the trailing combining class
*/
int start,
int current, int p,
char c, char c2,
int/*unsigned byte*/ cc) {
int r;
// search for the insertion point where cc>=prevCC
// get the prevCC
// this will be the last code point, so keep its cc
break;
}
}
// this is where we are right now with all these indices:
// [start]..[pPreBack] 0..? code points that we can ignore
// [pPreBack]..[pBack] 0..1 code points with prevCC<=cc
// [pBack]..[current] 0..n code points with >cc, move up to insert (c, c2)
// [current]..[p] 1 code point (c, c2) with cc
// move the code units in between up
r=p;
do {
}
}
// insert (c, c2)
if(c2!=0) {
}
// we know the cc of the last code point
return trailCC;
}
/**
* merge two UTF-16 string parts together
* to canonically order (order by combining classes) their concatenation
*
* the two strings may already be adjacent, so that the merging is done
* in-place; if the two strings are not adjacent, then the buffer holding the
* first one must be large enough
* the second string may or may not be ordered in itself
*
* before: [start]..[current] is already ordered, and
* [next]..[limit] may be ordered in itself, but
* is not in relation to [start..current[
* after: [start..current+(limit-next)[ is ordered
*
* the algorithm is a simple bubble-sort that takes the characters from
* src[next++] and inserts them in correct combining class order into the
* preceding part of the string
*
* since this function is called much less often than the single-code point
* insertOrdered(), it just uses that for easier maintenance
*
* @return the trailing combining class
*/
int start,
int current,
char[] data,
int next,
int limit,
boolean isOrdered) {
int r;
boolean adjacent;
if(cc==0) {
// does not bubble back
trailCC=0;
if(adjacent) {
} else {
}
}
if(isOrdered) {
break;
} else {
}
} else {
current=r;
}
}
}
// we know the cc of the last code point
return trailCC;
} else {
if(!adjacent) {
// copy the second string part
do {
}
}
}
int start,
int current,
char[] data,
final int next,
final int limit) {
}
int srcStart,
int srcLimit,
int minNoMaybe,
int qcMask,
int options,
boolean allowMaybe,
UnicodeSet nx){
int ccOrQCMask;
long norm32;
char c, c2;
long qcNorm32;
char[] buffer ;
if(!isDataLoaded) {
return NormalizerBase.MAYBE;
}
// initialize
prevCC=0;
for(;;) {
for(;;) {
return result;
break;
}
prevCC=0;
}
// check one above-minimum, relevant code unit
if(isNorm32LeadSurrogate(norm32)) {
// c is a lead surrogate, get the real norm32
++srcStart;
} else {
norm32=0;
c2=0;
}
}else{
c2=0;
}
/* excluded: norm32==0 */
norm32=0;
}
// check the combining order
return NormalizerBase.NO;
}
// check for "no" or "maybe" quick check flags
break;
} else if(qcNorm32!=0) {
// "maybe" can only occur for NFC and NFKC
if(allowMaybe){
}else{
// normalize a section around here to see if it is really
// normalized or not
int prevStarter;
int/*unsigned*/ decompQCMask;
// find the previous starter
// set prevStarter to the beginning of the current character
// safe because unpaired surrogates do not result
// in "maybe"
--prevStarter;
}
(char)minNoMaybe);
// find the next true starter in [src..limit[ - modifies
// src to point to the next starter
decompQCMask,(char) minNoMaybe);
//set the args for compose part
// decompose and recompose [prevStarter..src[
// compare the normalized version with the original
break;
}
// continue after the next starter
}
}
}
return result;
}
//------------------------------------------------------
// make NFD & NFKD
//------------------------------------------------------
boolean compat,int[] outTrailCC,
UnicodeSet nx) {
char[] buffer = new char[3];
int prevSrc;
long norm32;
int ccOrQCMask, qcMask;
int reorderStartIndex, length;
char c, c2, minNoMaybe;
char[] p;
int pStart;
if(!compat) {
} else {
}
/* initialize */
prevCC=0;
norm32=0;
c=0;
pStart=0;
for(;;) {
/* count code units below the minimum or with irrelevant data for
* the quick check
*/
prevCC=0;
++srcIndex;
}
/* copy these code units all at once */
}
}
/* end of source reached? */
break;
}
/* c already contains *src and norm32 is set for it, increment src*/
++srcIndex;
/* check one above-minimum, relevant code unit */
/*
* generally, set p and length to the decomposition string;
* in simple cases, p==NULL and (c, c2) will hold the length code
* units to append; in all cases, set cc to the lead and trailCC to
* the trail combining class
*
* the following merge-sort of the current character into the
* preceding, canonically ordered result text will use the
* optimized insertOrdered()
* if there is only one single code point to process;
* this is indicated with p==NULL, and (c, c2) is the character to
* insert
* ((c, 0) for a BMP character and (lead surrogate, trail surrogate)
* for a supplementary character)
* otherwise, p[length] is merged in with _mergeOrdered()
*/
if(isNorm32HangulOrJamo(norm32)) {
if(nx_contains(nx, c)) {
c2=0;
p=null;
length=1;
} else {
// Hangul syllable: decompose algorithmically
p=buffer;
pStart=0;
c-=HANGUL_BASE;
c2=(char)(c%JAMO_T_COUNT);
c/=JAMO_T_COUNT;
if(c2>0) {
length=3;
} else {
length=2;
}
}
} else {
if(isNorm32Regular(norm32)) {
c2=0;
length=1;
} else {
// c is a lead surrogate, get the real norm32
++srcIndex;
length=2;
} else {
c2=0;
length=1;
norm32=0;
}
}
/* get the decomposition and the lead and trail cc's */
/* excluded: norm32==0 */
p=null;
/* c does not decompose */
p=null;
pStart=-1;
} else {
/* c decomposes, get everything from the variable-length
* extra data
*/
p=extraData;
if(length==1) {
/* fastpath a single code unit from decomposition */
c=p[pStart];
c2=0;
p=null;
pStart=-1;
}
}
}
/* append the decomposition to the destination buffer, assume
* length>0
*/
int reorderSplit=destIndex;
if(p==null) {
/* fastpath: single code point */
/* (c, c2) is out of order with respect to the preceding
* text
*/
} else {
/* just append (c, c2) */
if(c2!=0) {
}
}
} else {
/* general: multiple code points (ordered by themselves)
* from decomposition
*/
/* the decomposition is out of order with respect to the
* preceding text
*/
} else {
/* just append the decomposition */
do {
} while(--length>0);
}
}
} else {
/* buffer overflow */
/* keep incrementing the destIndex for preflighting */
}
if(prevCC==0) {
}
}
}
/* make NFC & NFKC ------------------------------------------------------ */
/**
* Argument/result holder for the helper that gets the composition
* properties of the next character.
*/
private static final class NextCombiningArgs{
// text being read
char[] source;
// NOTE(review): presumably the index of the next code unit to read - confirm
int start;
// the limit is not kept here; the helper receives it as a separate parameter
//int limit;
// NOTE(review): presumably the code unit(s) of the character that was read,
// with c2 non-zero only for a surrogate pair - confirm
char c;
char c2;
}
/* get the composition properties of the next character */
int limit,
UnicodeSet nx) {
long/*unsigned*/ norm32;
int combineFlags;
/* get properties */
/* preset output values for most characters */
return 0;
} else {
if(isNorm32Regular(norm32)) {
/* set cc etc. below */
} else if(isNorm32HangulOrJamo(norm32)) {
/* a compatibility decomposition contained Jamos */
(norm32>>EXTRA_SHIFT)));
return (int)(norm32&COMBINES_ANY);
} else {
/* c is a lead surrogate, get the real norm32 */
} else {
return 0;
}
}
return 0; /* excluded: norm32==0 */
}
if(combineFlags!=0) {
}
return combineFlags;
}
}
/*
* given a composition-result starter (c, c2) - which means its cc==0,
* it combines forward, it has extra data, its norm32!=0,
* it is not a Hangul or Jamo,
* get just its combineFwdIndex
*
* norm32(c) is special if and only if c2!=0
*/
long/*unsigned*/ norm32;
if(c2!=0) {
}
}
/*
* Find the recomposition result for
* a forward-combining character
* (specified with a pointer to its part of the combiningTable[])
* and a backward-combining character
* (specified with its combineBackIndex).
*
* If these two characters combine, then set (value, value2)
* with the code unit(s) of the composition character.
*
* Return value:
* 0 do not combine
* 1 combine
* >1 combine, and the composition is a forward-combining starter
*
* See unormimp.h for a description of the composition table format.
*/
int/*unsinged*/ combineBackIndex,
int[] outValues) {
int/*unsigned*/ key;
throw new IllegalArgumentException();
}
/* search in the starter's composition table */
for(;;) {
if(key>=combineBackIndex) {
break;
}
}
/* mask off bit 15, the last-entry-in-the-list flag */
/* found! combine! */
/* is the composition a starter that combines forward? */
/* get the composition result code point from the variable-length
* result value
*/
/* surrogate pair composition result */
} else {
/* BMP composition result U+2000..U+ffff */
value2=0;
}
} else {
/* BMP composition result U+0000..U+1fff */
value&=0x1fff;
value2=0;
}
return key;
} else {
/* not found */
return 0;
}
}
/**
* Argument/result holder for recompose(): describes the range of text
* (already decomposed and canonically ordered) that is to be recomposed.
*/
private static final class RecomposeArgs{
// text to recompose in place
char[] source;
// NOTE(review): presumably the index where recomposition starts - confirm
int start;
// end of the range; recompose() adjusts it (recomposition never lengthens the text)
int limit;
}
/*
* recompose the characters in [p..limit[
* (which is in NFD - decomposed and canonically ordered),
* adjust limit, and return the trailing cc
*
* since for NFKC we may get Jamos in decompositions, we need to
* recompose those too
*
* note that recomposition never lengthens the text:
* any character consists of either one or two code units;
* a composition may contain at most one more code unit than the original
* starter, while the combining mark that is removed has at least one code
* unit
*/
int remove, q, r;
int /*unsigned*/ combineFlags;
int /*unsigned byte*/ prevCC;
boolean starterIsSupplementary;
int starter;
int[] outValues = new int[2];
starterIsSupplementary=false; /* will not be used until starter!=NULL */
prevCC=0;
for(;;) {
/* c is a Jamo V/T, see if we can compose it with the
* previous character
*/
/* for the PRI #29 fix, check that there is no intervening combining mark */
combineFlags=0;
if(combineBackIndex==0xfff2) {
/* Jamo V, compose with previous Jamo L and following
* Jamo T
*/
-JAMO_T_BASE))<JAMO_T_COUNT) {
} else {
/* the result is an LV syllable, which is a starter (unlike LVT) */
}
} else {
/* excluded */
if(!isHangulWithoutJamoT(ncArg.c)) {
}
/* c is modified but not used any more -- c=*(p-1); -- re-read the Jamo V/T */
}
}
/*
* Normally, the following can not occur:
* Since the input is in NFD, there are no Hangul LV syllables that
* a Jamo T could combine with.
* All Jamo Ts are combined above when handling Jamo Vs.
*
* However, before the PRI #29 fix, this can occur due to
* an intervening combining mark between the Hangul LV and the Jamo T.
*/
} else {
/* Jamo T, compose with previous Hangul that does not have a Jamo T */
}
}
}
if(remove!=-1) {
/* remove the Jamo(s) */
q=remove;
}
}
if(combineFlags!=0) {
/*
* not starter=NULL because the composition is a Hangul LV syllable
* and might combine once more (but only before the PRI #29 fix)
*/
/* done? */
return (char)prevCC;
}
/* the composition is a Hangul LV syllable which is a starter that combines forward */
combineFwdIndex=0xfff0;
/* we combined; continue with looking for compositions */
continue;
}
}
/*
* now: cc==0 and the combining index does not include
* "forward" -> the rest of the loop body will reset starter
* to NULL; technically, a composed Hangul syllable is a
* starter, but it does not combine forward now that we have
* consumed all eligible Jamos; for Jamo V/T, combineFlags
* does not contain _NORM_COMBINES_FWD
*/
} else if(
/* the starter is not a Hangul LV or Jamo V/T and */
/* the combining mark is not blocked and */
/* the starter and the combining mark (c, c2) do combine */
combineBackIndex, outValues)) &&
/* the composition result is not excluded */
) {
/* replace the starter with the composition, remove the
* combining mark
*/
/* replace the starter with the composition */
if(starterIsSupplementary) {
if(value2!=0) {
/* both are supplementary */
} else {
/* the composition is shorter than the starter,
* move the intermediate characters forward one */
starterIsSupplementary=false;
q=starter+1;
r=q+1;
while(r<remove) {
}
--remove;
}
starterIsSupplementary=true;
/* } else { both are on the BMP, nothing more to do */
}
/* remove the combining mark by moving the following text
* over it */
q=remove;
}
}
/* keep prevCC because we removed the combining mark */
/* done? */
return (char)prevCC;
}
/* is the composition a starter that combines forward? */
if(result>1) {
(char)value2);
} else {
starter=-1;
}
/* we combined; continue with looking for compositions */
continue;
}
}
/* no combination this time */
return (char)prevCC;
}
/* if (c, c2) did not combine, then check if it is a starter */
/* found a new starter; combineFlags==0 if (c, c2) is excluded */
/* it may combine with something, prepare for it */
starterIsSupplementary=false;
} else {
starterIsSupplementary=false;
}
} else {
/* it will not combine with anything */
starter=-1;
}
/* FCC: no discontiguous compositions; any intervening character blocks */
starter=-1;
}
}
}
// find the last true starter between src[start]....src[current] going
// backwards and return its index
int/*unsigned*/ ccOrQCMask,
int/*unsigned*/ decompQCMask,
char minNoMaybe) {
long norm32;
break;
}
}
}
/* find the first true starter in [src..limit[ and return the
* pointer to it
*/
int/*unsigned*/ qcMask,
int/*unsigned*/ decompQCMask,
char minNoMaybe) {
int p;
long/*unsigned*/ norm32;
int ccOrQCMask;
char c, c2;
for(;;) {
break; /* end of string */
}
if(c<minNoMaybe) {
break; /* catches NUL terminater, too */
}
break; /* true starter */
}
if(isNorm32LeadSurrogate(norm32)) {
/* c is a lead surrogate, get the real norm32 */
/* unmatched first surrogate: counts as a true starter */
break;
}
break; /* true starter */
}
} else {
c2=0;
}
/* (c, c2) is not a true starter but its decomposition may be */
/* (c, c2) decomposes, get everything from the variable-length
* extra data */
/* get the first character's norm32 to check if it is a true
* starter */
break; /* true starter */
}
}
}
return start;
}
/**
* Argument/result holder for the helper that decomposes and recomposes
* [prevStarter..src[.
*/
private static final class ComposePartArgs{
// NOTE(review): presumably the combining class of the last character
// produced so far - confirm against the callers
int prevCC;
}
/* decompose and recompose [prevStarter..src[ */
int prevStarter,
int options,
UnicodeSet nx) {
int recomposeLimit;
/* decompose [prevStarter..src[ */
int[] outTrailCC = new int[1];
for(;;){
break;
}else{
}
}
/* recompose the decomposition */
}
/* return with a pointer to the recomposition and its length */
return buffer;
}
long/*unsigned*/ norm32,
boolean compat,
UnicodeSet nx) {
if(isJamoVTNorm32JamoV(norm32)) {
/* c is a Jamo V, compose with previous Jamo L and
* following Jamo T */
if(prev<JAMO_L_COUNT) {
(c-JAMO_V_BASE))*JAMO_T_COUNT);
/* check if the next character is a Jamo T (normal or
* compatibility) */
char next, t;
/* normal Jamo T */
++start;
c+=t;
} else if(compat) {
/* if NFKC, then check for compatibility Jamo T
* (BMP only) */
int p /*index into extra data array*/;
(t=(char)(extraData[p]-JAMO_T_BASE))
<JAMO_T_COUNT) {
/* compatibility Jamo T */
++start;
c+=t;
}
}
}
}
if(nx_contains(nx, c)) {
if(!isHangulWithoutJamoT(c)) {
--start; /* undo ++start from reading the Jamo T */
}
return false;
}
return true;
}
} else if(isHangulWithoutJamoT(prev)) {
/* c is a Jamo T, compose with previous Hangul LV that does not
* contain a Jamo T */
c=(char)(prev+(c-JAMO_T_BASE));
if(nx_contains(nx, c)) {
return false;
}
return true;
}
return false;
}
/*
public static int compose(char[] src, char[] dest,boolean compat, UnicodeSet nx){
return compose(src,0,src.length,dest,0,dest.length,compat, nx);
}
*/
int prevSrc, prevStarter;
long/*unsigned*/ norm32;
int ccOrQCMask, qcMask;
int reorderStartIndex, length;
char c, c2, minNoMaybe;
int[] ioIndex = new int[1];
} else {
}
/*
* prevStarter points to the last character before the current one
* that is a "true" starter with cc==0 and quick check "yes".
*
* prevStarter will be used instead of looking for a true starter
* while incrementally decomposing [prevStarter..prevSrc[
* in _composePart(). Having a good prevStarter allows us to just decompose
* the entire [prevStarter..prevSrc[.
*
* When _composePart() backs out from prevSrc back to prevStarter,
* then it also backs out destIndex by the same amount.
* Therefore, at all times, the (prevSrc-prevStarter) source units
* must correspond 1:1 to destination units counted with destIndex,
* except for reordering.
* This is true for the qc "yes" characters copied in the fast loop,
* and for pure reordering.
* prevStarter must be set forward to src when this is not true:
* In _composePart() and after composing a Hangul syllable.
*
* This mechanism relies on the assumption that the decomposition of a
* true starter also begins with a true starter. gennorm/store.c checks
* for this.
*/
prevCC=0;
/* avoid compiler warnings */
norm32=0;
c=0;
for(;;) {
/* count code units below the minimum or with irrelevant data for
* the quick check */
prevCC=0;
++srcIndex;
}
/* copy these code units all at once */
}
/* set prevStarter to the last character in the quick check
* loop */
--prevStarter;
}
}
/* end of source reached? */
break;
}
/* c already contains *src and norm32 is set for it, increment src*/
++srcIndex;
/*
* source buffer pointers:
*
* all done quick check current char not yet
* "yes" but (c, c2) processed
* may combine
* forward
* [-------------[-------------[-------------[-------------[
* | | | | |
* start prevStarter prevSrc src limit
*
*
* destination buffer pointers and indexes:
*
* all done might take not filled yet
* characters for
* reordering
* [-------------[-------------[-------------[
* | | | |
* dest reorderStartIndex destIndex destCapacity
*/
/* check one above-minimum, relevant code unit */
/*
* norm32 is for c=*(src-1), and the quick check flag is "no" or
* "maybe", and/or cc!=0
* check for Jamo V/T, then for surrogates and regular characters
* c is not a Hangul syllable or Jamo L because
* they are not marked with no/maybe for NFC & NFKC (and their cc==0)
*/
if(isNorm32HangulOrJamo(norm32)) {
/*
* c is a Jamo V/T:
* try to compose with the previous character, Jamo V also with
* a following Jamo T, and set values here right now in case we
* just continue with the main loop
*/
if(
destIndex>0 &&
nx)
) {
continue;
}
/* the Jamo V/T did not compose into a Hangul syllable, just
* append to dest */
c2=0;
length=1;
} else {
if(isNorm32Regular(norm32)) {
c2=0;
length=1;
} else {
/* c is a lead surrogate, get the real norm32 */
++srcIndex;
length=2;
} else {
/* c is an unpaired lead surrogate, nothing to do */
c2=0;
length=1;
norm32=0;
}
}
/* we are looking at the character (c, c2) at [prevSrc..src[ */
/* excluded: norm32==0 */
cc=0;
} else {
char[] p;
/*
* find appropriate boundaries around this character,
* decompose the source text from between the boundaries,
* and recompose it
*
* this puts the intermediate text into the side buffer because
* it might be longer than the recomposition end result,
* or the destination buffer may be too short or missing
*
* note that destIndex may be adjusted backwards to account
* for source text that passed the quick check but needed to
* take part in the recomposition
*/
/*
* find the last true starter in [prevStarter..src[
* it is either the decomposition of the current character (at prevSrc),
* or prevStarter
*/
} else {
/* adjust destIndex: back out what had been copied with qc "yes" */
}
/* find the next true starter in [src..limit[ */
//args.prevStarter = prevStarter;
//args.destIndex = destIndex;
if(p==null) {
/* an error occurred (out of memory) */
break;
}
/* append the recomposed buffer contents to the destination
* buffer */
int i=0;
--length;
}
} else {
/* buffer overflow */
/* keep incrementing the destIndex for preflighting */
}
continue;
}
}
/* append the single code point (c, c2) to the destination buffer */
/* (c, c2) is out of order with respect to the preceding
* text */
int reorderSplit= destIndex;
} else {
/* just append (c, c2) */
if(c2!=0) {
}
}
} else {
/* buffer overflow */
/* keep incrementing the destIndex for preflighting */
}
}
}
/**
* NOTE(review): presumably returns the canonical combining class of c,
* taken from its norm32 value; only the local declaration is present
* here, the lookup and the return statement are missing - verify against
* the upstream source.
* @param c code point
*/
public static int getCombiningClass(int c) {
long norm32;
}
/**
* Reports whether c is a full composition exclusion.
* Always returns false when the loaded data is not in format version 2.1.
* NOTE(review): the lookup for the format-2.1 case is not present here -
* presumably it consults the auxiliary trie; confirm against the upstream
* source.
* @param c code point to test
*/
public static boolean isFullCompositionExclusion(int c) {
if(isFormatVersion_2_1) {
} else {
// no format-2.1 data available: report false
return false;
}
}
/**
* Reports whether c is a canonically safe start.
* Always returns false when the loaded data is not in format version 2.1.
* NOTE(review): the lookup for the format-2.1 case is not present here -
* presumably it consults the auxiliary trie; confirm against the upstream
* source.
* @param c code point to test
*/
public static boolean isCanonSafeStart(int c) {
if(isFormatVersion_2_1) {
} else {
// no format-2.1 data available: report false
return false;
}
}
/* Is c an NF<mode>-skippable code point? See unormimp.h. */
long /*unsigned int*/ norm32;
char aux;
/* check conditions (a)..(e), see unormimp.h */
return false; /* fails (a)..(e), not skippable */
}
return true; /* NF*D, passed (a)..(c), is skippable */
}
/* check conditions (a)..(e), see unormimp.h */
return true; /* no canonical decomposition, is skippable */
}
/* check Hangul syllables algorithmically */
if(isNorm32HangulOrJamo(norm32)) {
/* Jamo passed (a)..(e) above, must be Hangul */
return !isHangulWithoutJamoT((char)c); /* LVT are skippable, LV are not */
}
/* if(mode<=UNORM_NFKC) { -- enable when implementing FCC */
/* NF*C, test (f) flag */
if(!isFormatVersion_2_2) {
return false; /* no (f) data, say not skippable to be safe */
}
/* } else { FCC, test fcd<=1 instead of the above } */
}
int c;
/* add the start code point of each same-value range of each trie */
//utrie_enum(&normTrie, NULL, _enumPropertyStartsRange, set);
}
//utrie_enum(&fcdTrie, NULL, _enumPropertyStartsRange, set);
}
if(isFormatVersion_2_1){
//utrie_enum(&auxTrie, NULL, _enumPropertyStartsRange, set);
}
}
/* add Hangul LV syllables and LV+1 because of skippables */
}
return set; // for chaining
}
/**
* Internal API, used in UCharacter.getIntPropertyValue().
* @internal
* @param c code point
* @param modeValue numeric value compatible with Mode
* @return numeric value compatible with QuickCheck
*/
final int qcMask[/*UNORM_MODE_COUNT*/]={
};
if(norm32==0) {
return 1; // YES
return 0; // NO
} else /* _NORM_QC_ANY_MAYBE */ {
return 2; // MAYBE;
}
}
boolean codePointOrder) {
/* setup for fix-up */
int lengthResult;
lengthResult=-1;
lengthResult=0;
} else /* length1>length2 */ {
lengthResult=1;
}
return lengthResult;
}
for(;;) {
/* check pseudo-limit */
return lengthResult;
}
break;
}
++s1Start;
++s2Start;
}
/* setup for fix-up */
/* if both values are in or above the surrogate range, fix them up */
/* subtract 0x2800 from BMP code points to make them smaller than
* supplementary ones */
if(
) ||
)
) {
/* part of a surrogate pair, leave >=d800 */
} else {
/* BMP code point - may be surrogate code point - make <d800 */
c1-=0x2800;
}
if(
) ||
)
) {
/* part of a surrogate pair, leave >=d800 */
} else {
/* BMP code point - may be surrogate code point - make <d800 */
c2-=0x2800;
}
}
/* now c1 and c2 are in UTF-32-compatible order */
}
/*
* Status of tailored normalization
*
* This was done initially for investigation on Unicode public review issue 7
* (http://www.unicode.org/review/). See Jitterbug 2481.
* While the UTC at meeting #94 (2003mar) did not take up the issue, this is
* a permanent feature in ICU 2.6 in support of IDNA which requires true
* Unicode 3.2 normalization.
* (NormalizationCorrections are rolled into IDNA mapping tables.)
*
* Tailored normalization as implemented here makes it possible to "normalize less"
* than full Unicode normalization would.
* Based internally on a UnicodeSet of code points that are
* "excluded from normalization", the normalization functions leave those
* code points alone ("inert"). This means that tailored normalization
* still transforms text into a canonically equivalent form.
* It does not add decompositions to code points that do not have any or
* change decomposition results.
*
* Any function that searches for a safe boundary has not been touched,
* which means that these functions will be over-pessimistic when
* exclusions are applied.
* This should not matter because subsequent checks and normalizations
* do apply the exclusions; only a little more of the text may be processed
* than necessary under exclusions.
*
* Normalization exclusions have the following effect on excluded code points c:
* - c is not decomposed
* - c is not a composition target
* - c does not combine forward or backward for composition
* except that this is not implemented for Jamo
* - c is treated as having a combining class of 0
*/
/*
* Constants for the bit fields in the options bit set parameter.
* These need not be public.
* A user only needs to know the currently assigned values.
* The number and positions of reserved bits per field can remain private.
*/
// private static final int OPTIONS_UNICODE_SHIFT=5;
/* Constants for options flags for normalization.*/
/**
* Options bit 0, do not decompose Hangul syllables.
* @draft ICU 2.6
*/
/**
* Options bit 1, do not decompose CJK compatibility characters.
* @draft ICU 2.6
*/
/**
* Options bit 8, use buggy recomposition described in
* Unicode Public Review Issue #29
*
* Used in IDNA implementation according to strict interpretation
* of IDNA definition based on Unicode 3.2 which predates PRI #29.
*
* See ICU4C unormimp.h
*
* @draft ICU 3.2
*/
/*
* The following options are used only in some composition functions.
* They use bits 12 and up to preserve lower bits for the available options
* space in unorm_compare() -
* see documentation for UNORM_COMPARE_NORM_OPTIONS_SHIFT.
*/
/** Options bit 12, for compatibility vs. canonical decomposition. */
/** Options bit 13, no discontiguous composition (FCC vs. NFC). */
/* normalization exclusion sets --------------------------------------------- */
/*
* Normalization exclusion UnicodeSets are used for tailored normalization;
* see the comment near the beginning of this file.
*
* By specifying one or several sets of code points,
* those code points become inert for normalization.
*/
/* internal function, does not check for incoming U_FAILURE */
}
}
/* internal function, does not check for incoming U_FAILURE */
/* build a set from [CJK Ideographs]&[has canonical decomposition] */
/* start with an empty set for [has canonical decomposition] */
hasDecomp=new UnicodeSet();
/* iterate over all ideographs and remember which canonically decompose */
long norm32;
}
++start;
}
}
/* hasDecomp now contains all ideographs that decompose canonically */
}
return nxCache[NX_CJK_COMPAT];
}
if(options==0) {
return null;
}
/* build a set with all code points that were not designated by the specified Unicode version */
switch(options) {
case NormalizerBase.UNICODE_3_2:
break;
default:
return null;
}
}
}
/* Get a decomposition exclusion set. The data must be loaded. */
/* return basic sets */
return internalGetNXHangul();
}
if(options==NX_CJK_COMPAT) {
return internalGetNXCJKCompat();
}
return internalGetNXUnicode(options);
}
/* build a set from multiple subsets */
set=new UnicodeSet();
}
}
}
}
}
/* incoming failure, or no decomposition exclusions requested */
return null;
} else {
return internalGetNX(options);
}
}
}
}
/*****************************************************************************/
/**
* Get the canonical decomposition
* sherman for ComposedCharIter
*/
int length=0;
long norm32 = 0;
int ch = -1;
int index = 0;
int i = 0;
//TBD !!!! the hack code here saves us about 50ms at startup
if (ch == 0x30ff)
ch = 0xf900;
else if (ch == 0x10000)
ch = 0x1d15e;
else if (ch == 0x1d1c1)
ch = 0x2f800;
}
}
return i;
}
//------------------------------------------------------
// special method for Collation
//------------------------------------------------------
/**
* Tests whether a UTF-16 code unit falls in one of five ASCII ranges:
* 0x09-0x0D (tab through carriage return),
* 0x20-0x2F (space through '/'),
* 0x3A-0x40 (':' through '@'),
* 0x5B-0x60 ('[' through '`'),
* 0x7B-0x7E ('{' through '~').
* ASCII digits, ASCII letters and everything outside these ranges yield
* false.
* @param c code unit to test
* @return true if c lies inside any of the five ranges
*/
private static boolean needSingleQuotation(char c) {
    /* walk the range boundaries in ascending order */
    if (c < 0x0009) {
        return false;
    }
    if (c <= 0x000D) {
        return true;    /* 0x09..0x0D */
    }
    if (c < 0x0020) {
        return false;
    }
    if (c <= 0x002F) {
        return true;    /* 0x20..0x2F */
    }
    if (c < 0x003A) {
        return false;   /* ASCII digits */
    }
    if (c <= 0x0040) {
        return true;    /* 0x3A..0x40 */
    }
    if (c < 0x005B) {
        return false;   /* ASCII upper-case letters */
    }
    if (c <= 0x0060) {
        return true;    /* 0x5B..0x60 */
    }
    if (c < 0x007B) {
        return false;   /* ASCII lower-case letters */
    }
    return c <= 0x007E; /* 0x7B..0x7E, nothing above */
}
int srcIndex = 0;
int destIndex = 0;
char[] buffer = new char[3];
int prevSrc;
long norm32;
int ccOrQCMask;
int reorderStartIndex, length;
char c, c2;
char[] p;
int pStart;
// initialize
reorderStartIndex = 0;
prevCC = 0;
norm32 = 0;
c = 0;
pStart = 0;
for(;;) {
//quick check (1)less than minNoMaybe (2)no decomp (3)Hangul
( c >= '\uac00' && c <= '\ud7a3'))){
prevCC = 0;
++srcIndex;
}
// copy these code units all at once
}
}
// end of source reached?
break;
}
// c already contains *src and norm32 is set for it, increment src
++srcIndex;
if(isNorm32Regular(norm32)) {
c2 = 0;
length = 1;
} else {
// c is a lead surrogate, get the real norm32
++srcIndex;
length = 2;
} else {
c2 = 0;
length = 1;
norm32 = 0;
}
}
// get the decomposition and the lead and trail cc's
// c does not decompose
p = null;
pStart = -1;
} else {
// c decomposes, get everything from the variable-length
// extra data
p = extraData;
if(length == 1) {
// fastpath a single code unit from decomposition
c = p[pStart];
c2 = 0;
p = null;
pStart = -1;
}
}
// buffer overflow
}
// append the decomposition to the destination buffer, assume length>0
{
int reorderSplit = destIndex;
if(p == null) {
// fastpath: single code point
if (needSingleQuotation(c)) {
//if we need single quotation, no need to consider "prevCC"
//and it must NOT be a supplementary pair
trailCC = 0;
// (c, c2) is out of order with respect to the preceding
// text
} else {
// just append (c, c2)
if(c2 != 0) {
}
}
} else {
// general: multiple code points (ordered by themselves)
// from decomposition
if (needSingleQuotation(p[pStart])) {
length--;
do {
} while(--length > 0);
} else
} else {
// just append the decomposition
do {
} while(--length > 0);
}
}
}
if(prevCC == 0) {
}
}
}
//------------------------------------------------------
// mapping method for IDNA/StringPrep
//------------------------------------------------------
/*
* Normalization using NormalizerBase.UNICODE_3_2 option supports Unicode
* 3.2 normalization with Corrigendum 4 corrections. However, normalization
* without the corrections is necessary for IDNA/StringPrep support.
* This method is called when NormalizerBase.UNICODE_3_2_0_ORIGINAL option
* (= sun.text.Normalizer.UNICODE_3_2) is used and normalizes five
* characters in Corrigendum 4 before normalization in order to avoid
* incorrect normalization.
* For the Corrigendum 4 issue, refer to
* http://www.unicode.org/versions/corrigendum4.html
*/
/*
* Option used in NormalizerBase.UNICODE_3_2_0_ORIGINAL.
*/
/*
* Table with one row per Corrigendum 4 code point, in the order
* 0x2F868, 0x2F874, 0x2F91F, 0x2F95F, 0x2F9BF. Each row holds UTF-16 code
* units: the first row is a surrogate pair (U+2136A), the others are single
* BMP code units.
* NOTE(review): presumably each row is the text substituted for its code
* point by the Corrigendum 4 conversion code - confirm against the caller.
*/
private static final char[][] corrigendum4MappingTable = new char[][] {
    /* 0x2F868 */
    new char[] {'\uD844', '\uDF6A'},
    /* 0x2F874 */
    new char[] {'\u5F33'},
    /* 0x2F91F */
    new char[] {'\u43AB'},
    /* 0x2F95F */
    new char[] {'\u7AAE'},
    /* 0x2F9BF */
    new char[] {'\u4D57'},
};
/*
* Removing Corrigendum 4 fix
* @return normalized text
*/
return null;
}
switch (ch) {
case 0x2F868:
break;
case 0x2F874:
break;
case 0x2F91F:
break;
case 0x2F95F:
break;
case 0x2F9BF:
break;
default:
break;
}
}
}
}