text/normalizer/UTF16.java

0N/A/*
2362N/A * Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.
0N/A * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
0N/A *
0N/A * This code is free software; you can redistribute it and/or modify it
0N/A * under the terms of the GNU General Public License version 2 only, as
2362N/A * published by the Free Software Foundation.  Oracle designates this
0N/A * particular file as subject to the "Classpath" exception as provided
2362N/A * by Oracle in the LICENSE file that accompanied this code.
0N/A *
0N/A * This code is distributed in the hope that it will be useful, but WITHOUT
0N/A * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
0N/A * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
0N/A * version 2 for more details (a copy is included in the LICENSE file that
0N/A * accompanied this code).
0N/A *
0N/A * You should have received a copy of the GNU General Public License version
0N/A * 2 along with this work; if not, write to the Free Software Foundation,
0N/A * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
0N/A *
2362N/A * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
2362N/A * or visit www.oracle.com if you need additional information or have any
2362N/A * questions.
0N/A */
0N/A/*
0N/A *******************************************************************************
1091N/A * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved         *
0N/A *                                                                             *
0N/A * The original version of this source code and documentation is copyrighted   *
0N/A * and owned by IBM, These materials are provided under terms of a License     *
0N/A * Agreement between IBM and Sun. This technology is protected by multiple     *
0N/A * US and International patents. This notice and attribution to IBM may not    *
0N/A * to removed.                                                                 *
0N/A *******************************************************************************
0N/A */
0N/A
0N/Apackage sun.text.normalizer;
0N/A
/**
 * <p>Standalone utility class providing UTF16 character conversions and
 * indexing conversions.</p>
 * <p>Code that uses strings alone rarely needs modification.
 * By design, UTF-16 does not allow overlap, so searching for strings is a safe
 * operation. Similarly, concatenation is always safe. Substringing is safe if
 * the start and end are both on UTF-32 boundaries. In normal code, the values
 * for start and end are on those boundaries, since they arose from operations
 * like searching. If not, the nearest UTF-32 boundaries can be determined
 * using <code>bounds()</code>.</p>
 * <strong>Examples:</strong>
 * <p>The following examples illustrate use of some of these methods.
 * <pre>
 * // iteration forwards: Original
 * for (int i = 0; i &lt; s.length(); ++i) {
 *     char ch = s.charAt(i);
 *     doSomethingWith(ch);
 * }
 *
 * // iteration forwards: Changes for UTF-32
 * int ch;
 * for (int i = 0; i &lt; s.length(); i += UTF16.getCharCount(ch)) {
 *     ch = UTF16.charAt(s, i);
 *     doSomethingWith(ch);
 * }
 *
 * // iteration backwards: Original
 * for (int i = s.length() - 1; i &gt;= 0; --i) {
 *     char ch = s.charAt(i);
 *     doSomethingWith(ch);
 * }
 *
 * // iteration backwards: Changes for UTF-32
 * // (the condition must be i &gt;= 0, otherwise index 0 is never visited)
 * int ch;
 * for (int i = s.length() - 1; i &gt;= 0; i -= UTF16.getCharCount(ch)) {
 *     ch = UTF16.charAt(s, i);
 *     doSomethingWith(ch);
 * }
 * </pre>
 * <strong>Notes:</strong>
 * <ul>
 *   <li>
 *   <strong>Naming:</strong> For clarity, High and Low surrogates are called
 *   <code>Lead</code> and <code>Trail</code> in the API, which gives a better
 *   sense of their ordering in a string. <code>offset16</code> and
 *   <code>offset32</code> are used to distinguish offsets to UTF-16
 *   boundaries vs offsets to UTF-32 boundaries. <code>int char32</code> is
 *   used to contain UTF-32 characters, as opposed to <code>char16</code>,
 *   which is a UTF-16 code unit.
 *   </li>
 *   <li>
 *   <strong>Roundtripping Offsets:</strong> You can always roundtrip from a
 *   UTF-32 offset to a UTF-16 offset and back. Because of the difference in
 *   structure, you can roundtrip from a UTF-16 offset to a UTF-32 offset and
 *   back if and only if <code>bounds(string, offset16) != TRAIL</code>.
 *   </li>
 *   <li>
 *   <strong>Exceptions:</strong> The error checking will throw an exception
 *   if indices are out of bounds. Other than that, all methods will
 *   behave reasonably, even if unmatched surrogates or out-of-bounds UTF-32
 *   values are present. <code>UCharacter.isLegal()</code> can be used to check
 *   for validity if desired.
 *   </li>
 *   <li>
 *   <strong>Unmatched Surrogates:</strong> If the string contains unmatched
 *   surrogates, then these are counted as one UTF-32 value. This matches
 *   their iteration behavior, which is vital. It also matches common display
 *   practice as missing glyphs (see the Unicode Standard Section 5.4, 5.5).
 *   </li>
 *   <li>
 *   <strong>Optimization:</strong> The method implementations may need
 *   optimization if the compiler doesn't fold static final methods. Since
 *   surrogate pairs will form an exceedingly small percentage of all the
 *   text in the world, the singleton case should always be optimized for.
 *   </li>
 * </ul>
 * @author Mark Davis, with help from Markus Scherer
 * @stable ICU 2.1
 */

public final class UTF16
{
    // public variables ---------------------------------------------------

    /**
     * The lowest Unicode code point value.
     * @stable ICU 2.1
     */
    public static final int CODEPOINT_MIN_VALUE = 0;
    /**
     * The highest Unicode code point value (scalar value) according to the
     * Unicode Standard.
     * @stable ICU 2.1
     */
    public static final int CODEPOINT_MAX_VALUE = 0x10ffff;
    /**
     * The minimum value for Supplementary code points
     * @stable ICU 2.1
     */
    public static final int SUPPLEMENTARY_MIN_VALUE  = 0x10000;
    /**
     * Lead surrogate minimum value
     * @stable ICU 2.1
     */
    public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800;
    /**
     * Trail surrogate minimum value
     * @stable ICU 2.1
     */
    public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00;
    /**
     * Lead surrogate maximum value
     * @stable ICU 2.1
     */
    public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF;
    /**
     * Trail surrogate maximum value
     * @stable ICU 2.1
     */
    public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF;
    /**
     * Surrogate minimum value
     * @stable ICU 2.1
     */
    public static final int SURROGATE_MIN_VALUE = LEAD_SURROGATE_MIN_VALUE;

    // public method ------------------------------------------------------

    /**
     * Extract a single UTF-32 value from a string.
     * Used when iterating forwards or backwards (with
     * <code>UTF16.getCharCount()</code>), as well as random access. If a
     * validity check is required, use
     * <code><a href="../lang/UCharacter.html#isLegal(char)">
     * UCharacter.isLegal()</a></code> on the return value.
     * If the char retrieved is part of a surrogate pair, its supplementary
     * character will be returned. If a complete supplementary character is
     * not found the incomplete character will be returned
     * @param source string of UTF-16 code units
     * @param offset16 UTF-16 offset of either code unit of the character.
     * @return the UTF-32 value that contains the char at offset16; an
     *         unmatched surrogate is returned as is.
     * @exception IndexOutOfBoundsException thrown if offset16 is out of
     *            bounds.
     * @stable ICU 2.1
     */
    public static int charAt(String source, int offset16) {
        char single = source.charAt(offset16);
        // Fast path: everything below the surrogate block is a complete
        // code point on its own.
        if (single < LEAD_SURROGATE_MIN_VALUE) {
            return single;
        }
        return _charAt(source, offset16, single);
    }

    /**
     * Slow path of {@link #charAt(String, int)} for code units at or above
     * the surrogate block.
     * @param source string of UTF-16 code units
     * @param offset16 valid index of <code>single</code> in source
     * @param single the code unit at offset16
     * @return the supplementary code point if <code>single</code> is half of
     *         a well-formed pair, otherwise <code>single</code> itself
     */
    private static int _charAt(String source, int offset16, char single) {
        if (single > TRAIL_SURROGATE_MAX_VALUE) {
            return single; // above the surrogate block: plain BMP character
        }

        // Convert the UTF-16 surrogate pair if necessary.
        // For simplicity in usage, and because the frequency of pairs is
        // low, look both directions.
        if (single <= LEAD_SURROGATE_MAX_VALUE) {
            // single is a lead surrogate: its partner, if any, follows it.
            int next = offset16 + 1;
            if (next < source.length()) {
                char trail = source.charAt(next);
                if (isTrailSurrogate(trail)) {
                    return Character.toCodePoint(single, trail);
                }
            }
        } else {
            // single is a trail surrogate: its partner, if any, precedes it.
            int previous = offset16 - 1;
            if (previous >= 0) {
                char lead = source.charAt(previous);
                if (isLeadSurrogate(lead)) {
                    return Character.toCodePoint(lead, single);
                }
            }
        }
        return single; // return unmatched surrogate
    }

    /**
     * Extract a single UTF-32 value from a substring.
     * Used when iterating forwards or backwards (with
     * <code>UTF16.getCharCount()</code>), as well as random access. If a
     * validity check is required, use
     * <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
     * </a></code> on the return value.
     * If the char retrieved is part of a surrogate pair, its supplementary
     * character will be returned. If a complete supplementary character is
     * not found the incomplete character will be returned. A surrogate whose
     * partner lies outside <code>[start, limit)</code> is treated as
     * unmatched.
     * @param source array of UTF-16 chars
     * @param start inclusive start of the substring in the source array
     * @param limit exclusive end of the substring in the source array
     * @param offset16 UTF-16 offset relative to start
     * @return the UTF-32 value that contains the char at offset16; an
     *         unmatched surrogate is returned as is.
     * @exception IndexOutOfBoundsException thrown if offset16 is not within
     *            the range of start and limit.
     * @stable ICU 2.1
     */
    public static int charAt(char source[], int start, int limit,
                             int offset16)
    {
        // From here on offset16 is an absolute index into source.
        offset16 += start;
        if (offset16 < start || offset16 >= limit) {
            throw new ArrayIndexOutOfBoundsException(offset16);
        }

        char single = source[offset16];
        if (!isSurrogate(single)) {
            return single;
        }

        // Convert the UTF-16 surrogate pair if necessary.
        // For simplicity in usage, and because the frequency of pairs is
        // low, look both directions.
        if (single <= LEAD_SURROGATE_MAX_VALUE) {
            // Lead surrogate: the trail must be the next char inside the
            // substring.
            offset16++;
            if (offset16 >= limit) {
                return single;
            }
            char trail = source[offset16];
            if (isTrailSurrogate(trail)) {
                return Character.toCodePoint(single, trail);
            }
        } else {
            // Trail surrogate: the lead must be the previous char inside
            // the substring.
            if (offset16 == start) {
                return single;
            }
            offset16--;
            char lead = source[offset16];
            if (isLeadSurrogate(lead)) {
                return Character.toCodePoint(lead, single);
            }
        }
        return single; // return unmatched surrogate
    }

    /**
     * Determines how many chars this char32 requires.
     * If a validity check is required, use <code>
     * <a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on
     * char32 before calling.
     * @param char32 the input codepoint.
     * @return 2 if is in supplementary space, otherwise 1.
     * @stable ICU 2.1
     */
    public static int getCharCount(int char32)
    {
        return (char32 < SUPPLEMENTARY_MIN_VALUE) ? 1 : 2;
    }

    /**
     * Determines whether the code value is a surrogate.
     * @param char16 the input character.
     * @return true iff the input character is a surrogate.
     * @stable ICU 2.1
     */
    public static boolean isSurrogate(char char16)
    {
        return LEAD_SURROGATE_MIN_VALUE <= char16
            && char16 <= TRAIL_SURROGATE_MAX_VALUE;
    }

    /**
     * Determines whether the character is a trail surrogate.
     * @param char16 the input character.
     * @return true iff the input character is a trail surrogate.
     * @stable ICU 2.1
     */
    public static boolean isTrailSurrogate(char char16)
    {
        return TRAIL_SURROGATE_MIN_VALUE <= char16
            && char16 <= TRAIL_SURROGATE_MAX_VALUE;
    }

    /**
     * Determines whether the character is a lead surrogate.
     * @param char16 the input character.
     * @return true iff the input character is a lead surrogate
     * @stable ICU 2.1
     */
    public static boolean isLeadSurrogate(char char16)
    {
        return LEAD_SURROGATE_MIN_VALUE <= char16
            && char16 <= LEAD_SURROGATE_MAX_VALUE;
    }

    /**
     * Returns the lead surrogate.
     * If a validity check is required, use
     * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code>
     * on char32 before calling.
     * @param char32 the input character.
     * @return lead surrogate if the getCharCount(ch) is 2; <br>
     *         and 0 otherwise (note: 0 is not a valid lead surrogate).
     * @stable ICU 2.1
     */
    public static char getLeadSurrogate(int char32)
    {
        if (char32 < SUPPLEMENTARY_MIN_VALUE) {
            return 0;
        }
        return (char) (LEAD_SURROGATE_OFFSET_
                       + (char32 >> LEAD_SURROGATE_SHIFT_));
    }

    /**
     * Returns the trail surrogate.
     * If a validity check is required, use
     * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code>
     * on char32 before calling.
     * @param char32 the input character.
     * @return the trail surrogate if the getCharCount(ch) is 2; <br>otherwise
     *         the character itself
     * @stable ICU 2.1
     */
    public static char getTrailSurrogate(int char32)
    {
        if (char32 < SUPPLEMENTARY_MIN_VALUE) {
            return (char) char32;
        }
        return (char) (TRAIL_SURROGATE_MIN_VALUE
                       + (char32 & TRAIL_SURROGATE_MASK_));
    }

    /**
     * Convenience method corresponding to String.valueOf(char). Returns a one
     * or two char string containing the UTF-32 value in UTF16 format. If a
     * validity check is required, use
     * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code>
     * on char32 before calling.
     * @param char32 the input character.
     * @return string value of char32 in UTF16 format
     * @exception IllegalArgumentException thrown if char32 is outside the
     *            range CODEPOINT_MIN_VALUE..CODEPOINT_MAX_VALUE.
     * @stable ICU 2.1
     */
    public static String valueOf(int char32)
    {
        if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
            throw new IllegalArgumentException("Illegal codepoint");
        }
        return toString(char32);
    }

    /**
     * Append a single UTF-32 value to the end of a StringBuffer.
     * If a validity check is required, use
     * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code>
     * on char32 before calling.
     * @param target the buffer to append to
     * @param char32 value to append.
     * @return the updated StringBuffer (the same instance as target)
     * @exception IllegalArgumentException thrown when char32 does not lie
     *            within the range of the Unicode codepoints
     * @stable ICU 2.1
     */
    public static StringBuffer append(StringBuffer target, int char32)
    {
        // Check for irregular values
        if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
            throw new IllegalArgumentException("Illegal codepoint: " + Integer.toHexString(char32));
        }

        // Write the UTF-16 values
        if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
            target.append(getLeadSurrogate(char32));
            target.append(getTrailSurrogate(char32));
        } else {
            target.append((char) char32);
        }
        return target;
    }

    //// for StringPrep
    /**
     * Shifts offset16 by the argument number of codepoints within a subarray.
     * A well-formed surrogate pair lying inside the subarray counts as one
     * codepoint; every other char, including an unmatched surrogate, counts
     * as one codepoint by itself.
     * @param source char array
     * @param start inclusive start of the subarray to be performed on
     * @param limit exclusive end of the subarray to be performed on
     * @param offset16 UTF16 position to shift relative to start
     * @param shift32 number of codepoints to shift; negative values shift
     *        towards start
     * @return new shifted offset16 relative to start
     * @exception IndexOutOfBoundsException if the new offset16 is out of
     *            bounds with respect to the subarray or the subarray bounds
     *            are out of range.
     * @stable ICU 2.1
     */
    public static int moveCodePointOffset(char source[], int start, int limit,
                                          int offset16, int shift32)
    {
        int size = source.length;
        if (start < 0 || limit < start) {
            throw new StringIndexOutOfBoundsException(start);
        }
        if (limit > size) {
            throw new StringIndexOutOfBoundsException(limit);
        }
        // Compare against (limit - start) rather than computing
        // (offset16 + start) first: the sum can wrap around for very large
        // offsets and would then slip past the range check.
        if (offset16 < 0 || offset16 > limit - start) {
            throw new StringIndexOutOfBoundsException(offset16);
        }

        int result = start + offset16; // absolute index, start <= result <= limit
        int remaining;                 // codepoints still to be skipped

        if (shift32 > 0) {
            // Quick reject: every codepoint occupies at least one char.
            // Written as a subtraction so that it cannot overflow.
            if (shift32 > size - result) {
                throw new StringIndexOutOfBoundsException(result);
            }
            remaining = shift32;
            while (result < limit && remaining > 0) {
                // Step over the trail too when a complete pair starts here.
                if (isLeadSurrogate(source[result]) && (result + 1 < limit)
                        && isTrailSurrogate(source[result + 1])) {
                    result++;
                }
                remaining--;
                result++;
            }
        } else {
            // result >= 0 and shift32 <= 0, so this sum cannot overflow.
            if (result + shift32 < start) {
                throw new StringIndexOutOfBoundsException(result);
            }
            remaining = -shift32;
            while (remaining > 0) {
                result--;
                if (result < start) {
                    break; // ran out of subarray; reported below
                }
                // Step over the lead too when a complete pair ends here.
                if (isTrailSurrogate(source[result]) && result > start
                        && isLeadSurrogate(source[result - 1])) {
                    result--;
                }
                remaining--;
            }
        }
        // Surrogate pairs may have consumed more chars than the quick
        // checks above allowed for.
        if (remaining != 0) {
            throw new StringIndexOutOfBoundsException(shift32);
        }
        return result - start;
    }

    // private data members -------------------------------------------------

    /**
     * Shift value for lead surrogate to form a supplementary character.
     */
    private static final int LEAD_SURROGATE_SHIFT_ = 10;

    /**
     * Mask to retrieve the significant value from a trail surrogate.
     */
    private static final int TRAIL_SURROGATE_MASK_     = 0x3FF;

    /**
     * Value that all lead surrogate starts with
     */
    private static final int LEAD_SURROGATE_OFFSET_ =
        LEAD_SURROGATE_MIN_VALUE -
        (SUPPLEMENTARY_MIN_VALUE
         >> LEAD_SURROGATE_SHIFT_);

    // private methods ------------------------------------------------------

    /**
     * <p>Converts argument code point and returns a String object representing
     * the code point's value in UTF16 format.</p>
     * <p>This method does not check for the validity of the codepoint, the
     * results are not guaranteed if a invalid codepoint is passed as
     * argument.</p>
     * <p>The result is a string whose length is 1 for non-supplementary code
     * points, 2 otherwise.</p>
     * @param ch code point
     * @return string representation of the code point
     */
    private static String toString(int ch)
    {
        if (ch < SUPPLEMENTARY_MIN_VALUE) {
            return String.valueOf((char) ch);
        }
        // Exactly two chars are needed; build the string directly instead
        // of going through a synchronized buffer.
        return new String(new char[] {
            getLeadSurrogate(ch), getTrailSurrogate(ch)
        });
    }
}