/*
 * Copyright (c) 2005, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
0N/Apackage java.net;
0N/A
0N/Aimport java.io.InputStream;
0N/Aimport java.io.IOException;
0N/Aimport java.security.AccessController;
0N/Aimport java.security.PrivilegedAction;
0N/A
0N/Aimport sun.net.idn.StringPrep;
0N/Aimport sun.net.idn.Punycode;
0N/Aimport sun.text.normalizer.UCharacterIterator;
0N/A
0N/A/**
0N/A * Provides methods to convert internationalized domain names (IDNs) between
0N/A * a normal Unicode representation and an ASCII Compatible Encoding (ACE) representation.
0N/A * Internationalized domain names can use characters from the entire range of
0N/A * Unicode, while traditional domain names are restricted to ASCII characters.
0N/A * ACE is an encoding of Unicode strings that uses only ASCII characters and
0N/A * can be used with software (such as the Domain Name System) that only
0N/A * understands traditional domain names.
0N/A *
0N/A * <p>Internationalized domain names are defined in <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.
0N/A * RFC 3490 defines two operations: ToASCII and ToUnicode. These 2 operations employ
0N/A * <a href="http://www.ietf.org/rfc/rfc3491.txt">Nameprep</a> algorithm, which is a
0N/A * profile of <a href="http://www.ietf.org/rfc/rfc3454.txt">Stringprep</a>, and
0N/A * <a href="http://www.ietf.org/rfc/rfc3492.txt">Punycode</a> algorithm to convert
0N/A * domain name string back and forth.
0N/A *
0N/A * <p>The behavior of aforementioned conversion process can be adjusted by various flags:
0N/A * <ul>
0N/A * <li>If the ALLOW_UNASSIGNED flag is used, the domain name string to be converted
0N/A * can contain code points that are unassigned in Unicode 3.2, which is the
0N/A * Unicode version on which IDN conversion is based. If the flag is not used,
0N/A * the presence of such unassigned code points is treated as an error.
0N/A * <li>If the USE_STD3_ASCII_RULES flag is used, ASCII strings are checked against <a href="http://www.ietf.org/rfc/rfc1122.txt">RFC 1122</a> and <a href="http://www.ietf.org/rfc/rfc1123.txt">RFC 1123</a>.
0N/A * It is an error if they don't meet the requirements.
0N/A * </ul>
0N/A * These flags can be logically OR'ed together.
0N/A *
0N/A * <p>The security consideration is important with respect to internationalization
0N/A * domain name support. For example, English domain names may be <i>homographed</i>
0N/A * - maliciously misspelled by substitution of non-Latin letters.
0N/A * <a href="http://www.unicode.org/reports/tr36/">Unicode Technical Report #36</a>
0N/A * discusses security issues of IDN support as well as possible solutions.
0N/A * Applications are responsible for taking adequate security measures when using
0N/A * international domain names.
0N/A *
0N/A * @author Edward Wang
0N/A * @since 1.6
0N/A *
0N/A */
0N/Apublic final class IDN {
0N/A /**
0N/A * Flag to allow processing of unassigned code points
0N/A */
0N/A public static final int ALLOW_UNASSIGNED = 0x01;
0N/A
0N/A /**
0N/A * Flag to turn on the check against STD-3 ASCII rules
0N/A */
0N/A public static final int USE_STD3_ASCII_RULES = 0x02;
0N/A
0N/A
0N/A /**
0N/A * Translates a string from Unicode to ASCII Compatible Encoding (ACE),
0N/A * as defined by the ToASCII operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.
0N/A *
0N/A * <p>ToASCII operation can fail. ToASCII fails if any step of it fails.
0N/A * If ToASCII operation fails, an IllegalArgumentException will be thrown.
0N/A * In this case, the input string should not be used in an internationalized domain name.
0N/A *
0N/A * <p> A label is an individual part of a domain name. The original ToASCII operation,
0N/A * as defined in RFC 3490, only operates on a single label. This method can handle
0N/A * both label and entire domain name, by assuming that labels in a domain name are
0N/A * always separated by dots. The following characters are recognized as dots:
0N/A * &#0092;u002E (full stop), &#0092;u3002 (ideographic full stop), &#0092;uFF0E (fullwidth full stop),
0N/A * and &#0092;uFF61 (halfwidth ideographic full stop). if dots are
0N/A * used as label separators, this method also changes all of them to &#0092;u002E (full stop)
0N/A * in output translated string.
0N/A *
0N/A * @param input the string to be processed
0N/A * @param flag process flag; can be 0 or any logical OR of possible flags
0N/A *
0N/A * @return the translated <tt>String</tt>
0N/A *
0N/A * @throws IllegalArgumentException if the input string doesn't conform to RFC 3490 specification
0N/A */
0N/A public static String toASCII(String input, int flag)
0N/A {
0N/A int p = 0, q = 0;
0N/A StringBuffer out = new StringBuffer();
0N/A
0N/A while (p < input.length()) {
0N/A q = searchDots(input, p);
0N/A out.append(toASCIIInternal(input.substring(p, q), flag));
0N/A p = q + 1;
0N/A if (p < input.length()) out.append('.');
0N/A }
0N/A
0N/A return out.toString();
0N/A }
0N/A
0N/A
0N/A /**
0N/A * Translates a string from Unicode to ASCII Compatible Encoding (ACE),
0N/A * as defined by the ToASCII operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.
0N/A *
0N/A * <p> This convenience method works as if by invoking the
0N/A * two-argument counterpart as follows:
0N/A * <blockquote><tt>
0N/A * {@link #toASCII(String, int) toASCII}(input,&nbsp;0);
0N/A * </tt></blockquote>
0N/A *
0N/A * @param input the string to be processed
0N/A *
0N/A * @return the translated <tt>String</tt>
0N/A *
0N/A * @throws IllegalArgumentException if the input string doesn't conform to RFC 3490 specification
0N/A */
0N/A public static String toASCII(String input) {
0N/A return toASCII(input, 0);
0N/A }
0N/A
0N/A
0N/A /**
0N/A * Translates a string from ASCII Compatible Encoding (ACE) to Unicode,
0N/A * as defined by the ToUnicode operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.
0N/A *
0N/A * <p>ToUnicode never fails. In case of any error, the input string is returned unmodified.
0N/A *
0N/A * <p> A label is an individual part of a domain name. The original ToUnicode operation,
0N/A * as defined in RFC 3490, only operates on a single label. This method can handle
0N/A * both label and entire domain name, by assuming that labels in a domain name are
0N/A * always separated by dots. The following characters are recognized as dots:
0N/A * &#0092;u002E (full stop), &#0092;u3002 (ideographic full stop), &#0092;uFF0E (fullwidth full stop),
0N/A * and &#0092;uFF61 (halfwidth ideographic full stop).
0N/A *
0N/A * @param input the string to be processed
0N/A * @param flag process flag; can be 0 or any logical OR of possible flags
0N/A *
0N/A * @return the translated <tt>String</tt>
0N/A */
0N/A public static String toUnicode(String input, int flag) {
0N/A int p = 0, q = 0;
0N/A StringBuffer out = new StringBuffer();
0N/A
0N/A while (p < input.length()) {
0N/A q = searchDots(input, p);
0N/A out.append(toUnicodeInternal(input.substring(p, q), flag));
0N/A p = q + 1;
0N/A if (p < input.length()) out.append('.');
0N/A }
0N/A
0N/A return out.toString();
0N/A }
0N/A
0N/A
0N/A /**
0N/A * Translates a string from ASCII Compatible Encoding (ACE) to Unicode,
0N/A * as defined by the ToUnicode operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.
0N/A *
0N/A * <p> This convenience method works as if by invoking the
0N/A * two-argument counterpart as follows:
0N/A * <blockquote><tt>
0N/A * {@link #toUnicode(String, int) toUnicode}(input,&nbsp;0);
0N/A * </tt></blockquote>
0N/A *
0N/A * @param input the string to be processed
0N/A *
0N/A * @return the translated <tt>String</tt>
0N/A */
0N/A public static String toUnicode(String input) {
0N/A return toUnicode(input, 0);
0N/A }
0N/A
0N/A
0N/A /* ---------------- Private members -------------- */
0N/A
0N/A // ACE Prefix is "xn--"
0N/A private static final String ACE_PREFIX = "xn--";
0N/A private static final int ACE_PREFIX_LENGTH = ACE_PREFIX.length();
0N/A
0N/A private static final int MAX_LABEL_LENGTH = 63;
0N/A
0N/A // single instance of nameprep
0N/A private static StringPrep namePrep = null;
0N/A
0N/A static {
0N/A InputStream stream = null;
0N/A
0N/A try {
0N/A final String IDN_PROFILE = "uidna.spp";
0N/A if (System.getSecurityManager() != null) {
0N/A stream = AccessController.doPrivileged(new PrivilegedAction<InputStream>() {
0N/A public InputStream run() {
0N/A return StringPrep.class.getResourceAsStream(IDN_PROFILE);
0N/A }
0N/A });
0N/A } else {
0N/A stream = StringPrep.class.getResourceAsStream(IDN_PROFILE);
0N/A }
0N/A
0N/A namePrep = new StringPrep(stream);
0N/A stream.close();
0N/A } catch (IOException e) {
0N/A // should never reach here
0N/A assert false;
0N/A }
0N/A }
0N/A
0N/A
0N/A /* ---------------- Private operations -------------- */
0N/A
0N/A
0N/A //
0N/A // to suppress the default zero-argument constructor
0N/A //
0N/A private IDN() {}
0N/A
0N/A //
0N/A // toASCII operation; should only apply to a single label
0N/A //
0N/A private static String toASCIIInternal(String label, int flag)
0N/A {
0N/A // step 1
0N/A // Check if the string contains code points outside the ASCII range 0..0x7c.
0N/A boolean isASCII = isAllASCII(label);
0N/A StringBuffer dest;
0N/A
0N/A // step 2
0N/A // perform the nameprep operation; flag ALLOW_UNASSIGNED is used here
0N/A if (!isASCII) {
0N/A UCharacterIterator iter = UCharacterIterator.getInstance(label);
0N/A try {
0N/A dest = namePrep.prepare(iter, flag);
0N/A } catch (java.text.ParseException e) {
0N/A throw new IllegalArgumentException(e);
0N/A }
0N/A } else {
0N/A dest = new StringBuffer(label);
0N/A }
0N/A
0N/A // step 3
0N/A // Verify the absence of non-LDH ASCII code points
0N/A // 0..0x2c, 0x2e..0x2f, 0x3a..0x40, 0x5b..0x60, 0x7b..0x7f
0N/A // Verify the absence of leading and trailing hyphen
0N/A boolean useSTD3ASCIIRules = ((flag & USE_STD3_ASCII_RULES) != 0);
0N/A if (useSTD3ASCIIRules) {
0N/A for (int i = 0; i < dest.length(); i++) {
0N/A int c = dest.charAt(i);
0N/A if (!isLDHChar(c)) {
0N/A throw new IllegalArgumentException("Contains non-LDH characters");
0N/A }
0N/A }
0N/A
0N/A if (dest.charAt(0) == '-' || dest.charAt(dest.length() - 1) == '-') {
0N/A throw new IllegalArgumentException("Has leading or trailing hyphen");
0N/A }
0N/A }
0N/A
0N/A if (!isASCII) {
0N/A // step 4
0N/A // If all code points are inside 0..0x7f, skip to step 8
0N/A if (!isAllASCII(dest.toString())) {
0N/A // step 5
0N/A // verify the sequence does not begin with ACE prefix
0N/A if(!startsWithACEPrefix(dest)){
0N/A
0N/A // step 6
0N/A // encode the sequence with punycode
0N/A try {
0N/A dest = Punycode.encode(dest, null);
0N/A } catch (java.text.ParseException e) {
0N/A throw new IllegalArgumentException(e);
0N/A }
0N/A
0N/A dest = toASCIILower(dest);
0N/A
0N/A // step 7
0N/A // prepend the ACE prefix
0N/A dest.insert(0, ACE_PREFIX);
0N/A } else {
0N/A throw new IllegalArgumentException("The input starts with the ACE Prefix");
0N/A }
0N/A
0N/A }
0N/A }
0N/A
0N/A // step 8
0N/A // the length must be inside 1..63
0N/A if(dest.length() > MAX_LABEL_LENGTH){
0N/A throw new IllegalArgumentException("The label in the input is too long");
0N/A }
0N/A
0N/A return dest.toString();
0N/A }
0N/A
0N/A //
0N/A // toUnicode operation; should only apply to a single label
0N/A //
0N/A private static String toUnicodeInternal(String label, int flag) {
0N/A boolean[] caseFlags = null;
0N/A StringBuffer dest;
0N/A
0N/A // step 1
0N/A // find out if all the codepoints in input are ASCII
0N/A boolean isASCII = isAllASCII(label);
0N/A
0N/A if(!isASCII){
0N/A // step 2
0N/A // perform the nameprep operation; flag ALLOW_UNASSIGNED is used here
0N/A try {
0N/A UCharacterIterator iter = UCharacterIterator.getInstance(label);
0N/A dest = namePrep.prepare(iter, flag);
0N/A } catch (Exception e) {
0N/A // toUnicode never fails; if any step fails, return the input string
0N/A return label;
0N/A }
0N/A } else {
0N/A dest = new StringBuffer(label);
0N/A }
0N/A
0N/A // step 3
0N/A // verify ACE Prefix
0N/A if(startsWithACEPrefix(dest)) {
0N/A
0N/A // step 4
0N/A // Remove the ACE Prefix
0N/A String temp = dest.substring(ACE_PREFIX_LENGTH, dest.length());
0N/A
0N/A try {
0N/A // step 5
0N/A // Decode using punycode
0N/A StringBuffer decodeOut = Punycode.decode(new StringBuffer(temp), null);
0N/A
0N/A // step 6
0N/A // Apply toASCII
0N/A String toASCIIOut = toASCII(decodeOut.toString(), flag);
0N/A
0N/A // step 7
0N/A // verify
0N/A if (toASCIIOut.equalsIgnoreCase(dest.toString())) {
0N/A // step 8
0N/A // return output of step 5
0N/A return decodeOut.toString();
0N/A }
0N/A } catch (Exception ignored) {
0N/A // no-op
0N/A }
0N/A }
0N/A
0N/A // just return the input
0N/A return label;
0N/A }
0N/A
0N/A
0N/A //
0N/A // LDH stands for "letter/digit/hyphen", with characters restricted to the
0N/A // 26-letter Latin alphabet <A-Z a-z>, the digits <0-9>, and the hyphen
0N/A // <->
0N/A // non-LDH = 0..0x2C, 0x2E..0x2F, 0x3A..0x40, 0x56..0x60, 0x7B..0x7F
0N/A //
0N/A private static boolean isLDHChar(int ch){
0N/A // high runner case
0N/A if(ch > 0x007A){
0N/A return false;
0N/A }
0N/A //['-' '0'..'9' 'A'..'Z' 'a'..'z']
0N/A if((ch == 0x002D) ||
0N/A (0x0030 <= ch && ch <= 0x0039) ||
0N/A (0x0041 <= ch && ch <= 0x005A) ||
0N/A (0x0061 <= ch && ch <= 0x007A)
0N/A ){
0N/A return true;
0N/A }
0N/A return false;
0N/A }
0N/A
0N/A
0N/A //
0N/A // search dots in a string and return the index of that character;
0N/A // or if there is no dots, return the length of input string
0N/A // dots might be: \u002E (full stop), \u3002 (ideographic full stop), \uFF0E (fullwidth full stop),
0N/A // and \uFF61 (halfwidth ideographic full stop).
0N/A //
0N/A private static int searchDots(String s, int start) {
0N/A int i;
0N/A for (i = start; i < s.length(); i++) {
0N/A char c = s.charAt(i);
0N/A if (c == '.' || c == '\u3002' || c == '\uFF0E' || c == '\uFF61') {
0N/A break;
0N/A }
0N/A }
0N/A
0N/A return i;
0N/A }
0N/A
0N/A
0N/A //
0N/A // to check if a string only contains US-ASCII code point
0N/A //
0N/A private static boolean isAllASCII(String input) {
0N/A boolean isASCII = true;
0N/A for (int i = 0; i < input.length(); i++) {
0N/A int c = input.charAt(i);
0N/A if (c > 0x7F) {
0N/A isASCII = false;
0N/A break;
0N/A }
0N/A }
0N/A return isASCII;
0N/A }
0N/A
0N/A //
0N/A // to check if a string starts with ACE-prefix
0N/A //
0N/A private static boolean startsWithACEPrefix(StringBuffer input){
0N/A boolean startsWithPrefix = true;
0N/A
0N/A if(input.length() < ACE_PREFIX_LENGTH){
0N/A return false;
0N/A }
0N/A for(int i = 0; i < ACE_PREFIX_LENGTH; i++){
0N/A if(toASCIILower(input.charAt(i)) != ACE_PREFIX.charAt(i)){
0N/A startsWithPrefix = false;
0N/A }
0N/A }
0N/A return startsWithPrefix;
0N/A }
0N/A
0N/A private static char toASCIILower(char ch){
0N/A if('A' <= ch && ch <= 'Z'){
0N/A return (char)(ch + 'a' - 'A');
0N/A }
0N/A return ch;
0N/A }
0N/A
0N/A private static StringBuffer toASCIILower(StringBuffer input){
0N/A StringBuffer dest = new StringBuffer();
0N/A for(int i = 0; i < input.length();i++){
0N/A dest.append(toASCIILower(input.charAt(i)));
0N/A }
0N/A return dest;
0N/A }
0N/A}