// cs/ext/EUC_TW.java

/*
 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package sun.nio.cs.ext;

import java.io.*;
import java.nio.CharBuffer;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import java.util.Arrays;
import sun.nio.cs.HistoricallyNamedCharset;
import static sun.nio.cs.CharsetMapping.*;

public class EUC_TW extends Charset implements HistoricallyNamedCharset
{
    // Single-shift-2 lead byte. The decoder treats a sequence starting with
    // this value as the four-byte "Codeset 2" form, and the encoder writes it
    // as the first byte of every four-byte sequence it produces.
    private static final int SS2 = 0x8E;

    /*
       (1) EUC_TW
       Second byte of EUC_TW for cs2 is in range of
       0xA1-0xB0 for plane 1-16. According to CJKV /163,
       plane1 is coded in both cs1 and cs2. This impl
       however does not decode the codepoints of plane1
       in cs2, so only p2-p7 and p15 are supported in cs2.

       Plane2  0xA2;
       Plane3  0xA3;
       Plane4  0xA4;
       Plane5  0xA5;
       Plane6  0xA6;
       Plane7  0xA7;
       Plane15 0xAF;

       (2) Mapping
       The fact that all supplementary characters encoded in EUC_TW are
       in 0x2xxxx range gives us the room to optimize the data tables.

       Decoding:
       (1) save the lower 16-bit value of all codepoints of b->c mapping
           in a String array table  String[plane] b2c.
       (2) save "codepoint is supplementary" info (one bit) in a
           byte[] b2cIsSupp, so 8 codepoints (same codepoint value, different
           plane No) share one byte.

       Encoding:
       (1) c->b mappings are stored in
           char[] c2b/char[] c2bIndex
           char[] c2bSupp/char[] c2bSuppIndex  (indexed by the lower 16 bits
           of the codepoint)
       (2) byte[] c2bPlane stores the "plane info" of each euc-tw codepoint;
           BMP and Supp share the low/high 4 bits of one byte.

       Mapping tables are stored separately in EUC_TWMapping, which
       is generated by a tool.
     */

    /**
     * Creates the charset under the canonical name "x-EUC-TW", using the
     * aliases that ExtendedCharsets.aliasesFor returns for that name.
     */
    public EUC_TW() {
        super("x-EUC-TW", ExtendedCharsets.aliasesFor("x-EUC-TW"));
    }

    /**
     * Returns the historical name of this charset.
     *
     * @return the string "EUC_TW"
     */
    public String historicalName() {
        final String legacyName = "EUC_TW";
        return legacyName;
    }

    /**
     * Tells whether this charset contains the given charset.
     *
     * @param cs the charset to test
     * @return true if cs is named "US-ASCII" or is an instance of EUC_TW,
     *         false otherwise
     */
    public boolean contains(Charset cs) {
        if (cs.name().equals("US-ASCII"))
            return true;
        return cs instanceof EUC_TW;
    }

    /**
     * Creates a new decoder bound to this charset.
     *
     * @return a fresh Decoder instance
     */
    public CharsetDecoder newDecoder() {
        final CharsetDecoder decoder = new Decoder(this);
        return decoder;
    }

    /**
     * Creates a new encoder bound to this charset.
     *
     * @return a fresh Encoder instance
     */
    public CharsetEncoder newEncoder() {
        final CharsetEncoder encoder = new Encoder(this);
        return encoder;
    }

    public static class Decoder extends CharsetDecoder {
        /**
         * Creates a decoder for the given charset, passing 2.0f to
         * CharsetDecoder as both the average and the maximum number of
         * chars produced per input byte.
         *
         * @param cs the charset this decoder belongs to
         */
        public Decoder(Charset cs) {
            super(cs, 2.0f, 2.0f);
        }

        // Per-decoder scratch buffers handed to decode() by toUnicode():
        // c1 receives a single char, c2 a surrogate pair. toUnicode() returns
        // one of these two arrays, so the next call overwrites the contents.
        char[] c1 = new char[1];
        char[] c2 = new char[2];
        /**
         * Decodes one double-byte value of the given plane, using this
         * decoder's own scratch arrays as the result buffers.
         *
         * @param b1 first data byte (0..255)
         * @param b2 second data byte (0..255)
         * @param p  plane index into the b2c table
         * @return the scratch array c1 (one char) or c2 (surrogate pair)
         *         holding the result, or null if decode() returns null
         */
        public char[] toUnicode(int b1, int b2, int p) {
            final char[] decoded = decode(b1, b2, p, c1, c2);
            return decoded;
        }

        // Byte->char table, one String per plane index; decode() reads the
        // char at (b1 - b1Min) * dbSegSize + (b2 - b2Min).
        static final String[] b2c =  EUC_TWMapping.b2c;
        // Inclusive bounds that decode() applies to the first data byte.
        static final int b1Min    =  EUC_TWMapping.b1Min;
        static final int b1Max    =  EUC_TWMapping.b1Max;
        // Inclusive bounds that decode() applies to the second data byte.
        static final int b2Min    =  EUC_TWMapping.b2Min;
        static final int b2Max    =  EUC_TWMapping.b2Max;
        // Number of second-byte values, i.e. the row width used when
        // indexing b2c.
        static final int dbSegSize = b2Max - b2Min + 1;
        // Flag bytes indexed like b2c: bit p set means the b2c[p] entry at
        // that position is a supplementary character. Assigned by a static
        // initializer of this class.
        static final byte[] b2cIsSupp;

        // Maps the plane byte that follows SS2 to the plane index used with
        // b2c; every byte value that is not a supported plane maps to -1.
        static final byte[] cnspToIndex = new byte[0x100];
        static {
            Arrays.fill(cnspToIndex, (byte)-1);
            // planes 2..7 (bytes 0xa2..0xa7) occupy indexes 1..6
            for (int plane = 2; plane <= 7; plane++)
                cnspToIndex[0xa0 + plane] = (byte)(plane - 1);
            // plane 15 (byte 0xaf) occupies index 7
            cnspToIndex[0xaf] = 7;
        }

        // Unpacks EUC_TWMapping.b2cIsSuppStr into b2cIsSupp: every char of
        // the string contributes two bytes, high-order byte first.
        static {
            final String packed = EUC_TWMapping.b2cIsSuppStr;
            final int n = packed.length();
            // fill a local array and publish it once at the end; the original
            // author notes this is much faster than writing to the static
            // field directly
            final byte[] bits = new byte[n << 1];
            for (int i = 0; i < n; i++) {
                final char ch = packed.charAt(i);
                bits[2 * i]     = (byte)(ch >> 8);
                bits[2 * i + 1] = (byte)(ch & 0xff);
            }
            b2cIsSupp = bits;
        }

        /**
         * Tells whether a byte value lies within the inclusive range
         * b1Min..b1Max.
         *
         * @param b the byte value to test
         * @return true if b1Min <= b <= b1Max
         */
        static boolean isLegalDB(int b) {
            if (b < b1Min)
                return false;
            return b <= b1Max;
        }

        /**
         * Decodes one double-byte value of the given plane.
         *
         * @param b1 first data byte
         * @param b2 second data byte
         * @param p  plane index into b2c; also selects the flag bit tested
         *           in b2cIsSupp
         * @param c1 one-element buffer that receives a BMP result
         * @param c2 two-element buffer that receives a surrogate pair
         * @return c1 or c2, whichever was filled, or null when either byte
         *         is outside its range or the table entry is
         *         UNMAPPABLE_DECODING
         */
        static char[] decode(int b1, int b2, int p, char[] c1, char[] c2)
        {
            final boolean b1InRange = b1 >= b1Min && b1 <= b1Max;
            final boolean b2InRange = b2 >= b2Min && b2 <= b2Max;
            if (!(b1InRange && b2InRange))
                return null;

            final int pos = (b1 - b1Min) * dbSegSize + (b2 - b2Min);
            final char low16 = b2c[p].charAt(pos);
            if (low16 == UNMAPPABLE_DECODING)
                return null;

            final boolean supplementary = (b2cIsSupp[pos] & (1 << p)) != 0;
            if (supplementary) {
                // the table stores only the low 16 bits; the character
                // itself is 0x20000 + that value
                final int codePoint = 0x20000 + low16;
                c2[0] = Character.highSurrogate(codePoint);
                c2[1] = Character.lowSurrogate(codePoint);
                return c2;
            }
            c1[0] = low16;
            return c1;
        }

        /**
         * Decodes from an array-backed byte buffer into an array-backed
         * char buffer, working directly on the backing arrays.
         *
         * Three input forms are recognised: a single byte below 0x80
         * (ASCII), a two-byte sequence (codeset 1, plane index 0), and a
         * four-byte sequence SS2 + plane byte + two data bytes (codeset 2).
         * Both buffer positions are updated on every exit path so that they
         * cover only completely processed input.
         *
         * @param src array-backed source buffer
         * @param dst array-backed destination buffer
         * @return UNDERFLOW when the input is exhausted or ends inside a
         *         sequence, OVERFLOW when dst cannot hold the next result,
         *         otherwise a malformed-input or unmappable-character result
         */
        private CoderResult decodeArrayLoop(ByteBuffer src,
                                            CharBuffer dst)
        {
            final byte[] in = src.array();
            final int inBase = src.arrayOffset();
            int inPos = inBase + src.position();
            final int inEnd = inBase + src.limit();

            final char[] out = dst.array();
            final int outBase = dst.arrayOffset();
            int outPos = outBase + dst.position();
            final int outEnd = outBase + dst.limit();

            try {
                while (inPos < inEnd) {
                    final int lead = in[inPos] & 0xff;

                    if (lead < 0x80) {                // ASCII      G0
                        if (outEnd - outPos < 1)
                            return CoderResult.OVERFLOW;
                        out[outPos++] = (char) lead;
                        inPos++;
                        continue;
                    }

                    if (lead != SS2) {                // Codeset 1  G1
                        if (inEnd - inPos < 2)
                            return CoderResult.UNDERFLOW;
                        final int trail = in[inPos + 1] & 0xff;
                        final char[] cc = toUnicode(lead, trail, 0);
                        if (cc == null) {
                            if (isLegalDB(lead) && isLegalDB(trail))
                                return CoderResult.unmappableForLength(2);
                            return CoderResult.malformedForLength(1);
                        }
                        if (outEnd - outPos < 1)
                            return CoderResult.OVERFLOW;
                        out[outPos++] = cc[0];
                        inPos += 2;
                        continue;
                    }

                    // Codeset 2  G2: SS2, plane byte, two data bytes
                    if (inEnd - inPos < 4)
                        return CoderResult.UNDERFLOW;
                    final int planeIdx = cnspToIndex[in[inPos + 1] & 0xff];
                    if (planeIdx < 0)
                        return CoderResult.malformedForLength(2);
                    final int b1 = in[inPos + 2] & 0xff;
                    final int b2 = in[inPos + 3] & 0xff;
                    final char[] cc = toUnicode(b1, b2, planeIdx);
                    if (cc == null) {
                        if (isLegalDB(b1) && isLegalDB(b2))
                            return CoderResult.unmappableForLength(4);
                        return CoderResult.malformedForLength(4);
                    }
                    if (outEnd - outPos < cc.length)
                        return CoderResult.OVERFLOW;
                    // cc holds one char or a surrogate pair
                    for (int i = 0; i < cc.length; i++)
                        out[outPos++] = cc[i];
                    inPos += 4;
                }
                return CoderResult.UNDERFLOW;
            } finally {
                src.position(inPos - inBase);
                dst.position(outPos - outBase);
            }
        }

        /**
         * Decodes using only the relative get/put methods of the buffers;
         * used when either buffer has no accessible backing array.
         *
         * Recognises the same three input forms as the array variant:
         * ASCII, two-byte codeset 1, and four-byte SS2 codeset 2. On exit
         * the source position is reset to the start of the first sequence
         * that was not completely decoded.
         *
         * @param src source buffer
         * @param dst destination buffer
         * @return UNDERFLOW when the input is exhausted or ends inside a
         *         sequence, OVERFLOW when dst cannot hold the next result,
         *         otherwise a malformed-input or unmappable-character result
         */
        private CoderResult decodeBufferLoop(ByteBuffer src,
                                             CharBuffer dst)
        {
            // position of the first source byte not yet fully decoded
            int consumed = src.position();
            try {
                while (src.hasRemaining()) {
                    final int lead = src.get() & 0xff;

                    if (lead < 0x80) {                // ASCII      G0
                        if (!dst.hasRemaining())
                            return CoderResult.OVERFLOW;
                        dst.put((char) lead);
                        consumed++;
                        continue;
                    }

                    if (lead != SS2) {                // Codeset 1  G1
                        if (!src.hasRemaining())
                            return CoderResult.UNDERFLOW;
                        final int trail = src.get() & 0xff;
                        final char[] cc = toUnicode(lead, trail, 0);
                        if (cc == null) {
                            if (isLegalDB(lead) && isLegalDB(trail))
                                return CoderResult.unmappableForLength(2);
                            return CoderResult.malformedForLength(1);
                        }
                        if (!dst.hasRemaining())
                            return CoderResult.OVERFLOW;
                        dst.put(cc[0]);
                        consumed += 2;
                        continue;
                    }

                    // Codeset 2  G2: plane byte and two data bytes follow
                    if (src.remaining() < 3)
                        return CoderResult.UNDERFLOW;
                    final int planeIdx = cnspToIndex[src.get() & 0xff];
                    if (planeIdx < 0)
                        return CoderResult.malformedForLength(2);
                    final int b1 = src.get() & 0xff;
                    final int b2 = src.get() & 0xff;
                    final char[] cc = toUnicode(b1, b2, planeIdx);
                    if (cc == null) {
                        if (isLegalDB(b1) && isLegalDB(b2))
                            return CoderResult.unmappableForLength(4);
                        return CoderResult.malformedForLength(4);
                    }
                    if (dst.remaining() < cc.length)
                        return CoderResult.OVERFLOW;
                    // cc holds one char or a surrogate pair
                    for (int i = 0; i < cc.length; i++)
                        dst.put(cc[i]);
                    consumed += 4;
                }
                return CoderResult.UNDERFLOW;
            } finally {
                src.position(consumed);
            }
        }

        /**
         * Dispatches to the array-based loop when both buffers expose a
         * backing array, and to the buffer-based loop otherwise.
         *
         * @param src source buffer
         * @param dst destination buffer
         * @return the result of the selected loop
         */
        protected CoderResult decodeLoop(ByteBuffer src, CharBuffer dst)
        {
            final boolean arrayBacked = src.hasArray() && dst.hasArray();
            return arrayBacked ? decodeArrayLoop(src, dst)
                               : decodeBufferLoop(src, dst);
        }
    }

    public static class Encoder extends CharsetEncoder {
        // Per-encoder scratch buffer that receives the bytes of one encoded
        // character (at most four) before they are copied to the output;
        // the canEncode methods use it as a throwaway target as well.
        private byte[] bb = new byte[4];

        /**
         * Creates an encoder for the given charset, passing 4.0f to
         * CharsetEncoder as both the average and the maximum number of
         * bytes produced per input char.
         *
         * @param cs the charset this encoder belongs to
         */
        public Encoder(Charset cs) {
            super(cs, 4.0f, 4.0f);
        }

        /**
         * Tells whether a single char can be encoded.
         *
         * @param c the char to test
         * @return true if c is at most U+007F or toEUC finds a mapping
         *         for it
         */
        public boolean canEncode(char c) {
            if (c <= '\u007f')
                return true;
            return toEUC(c, bb) != -1;
        }

        /**
         * Tells whether every character of a sequence can be encoded.
         *
         * A high surrogate must be followed by a low surrogate, and the pair
         * must have a mapping; every other char is checked with
         * canEncode(char).
         *
         * @param cs the sequence to test
         * @return true if the whole sequence is encodable, false at the
         *         first char or pair that is not
         */
        public boolean canEncode(CharSequence cs) {
            for (int pos = 0; pos < cs.length(); ) {
                final char ch = cs.charAt(pos++);
                if (!Character.isHighSurrogate(ch)) {
                    if (!canEncode(ch))
                        return false;
                    continue;
                }
                // high surrogate: a mappable low surrogate must follow
                if (pos == cs.length())
                    return false;
                final char next = cs.charAt(pos++);
                if (!Character.isLowSurrogate(next))
                    return false;
                if (toEUC(ch, next, bb) == -1)
                    return false;
            }
            return true;
        }

        /**
         * Encodes a surrogate pair into bb by delegating to the static
         * encode(char, char, byte[]).
         *
         * @param hi  high surrogate
         * @param low low surrogate
         * @param bb  buffer that receives the encoded bytes
         * @return the value returned by encode: the number of bytes written,
         *         or -1 if the pair has no mapping
         */
        public int toEUC(char hi, char low, byte[] bb) {
            final int written = encode(hi, low, bb);
            return written;
        }

        /**
         * Encodes a single char into bb by delegating to the static
         * encode(char, byte[]).
         *
         * @param c  the char to encode
         * @param bb buffer that receives the encoded bytes
         * @return the value returned by encode: the number of bytes written,
         *         or -1 if the char has no mapping
         */
        public int toEUC(char c, byte[] bb) {
            final int written = encode(c, bb);
            return written;
        }

        /**
         * Encodes from an array-backed char buffer into an array-backed
         * byte buffer, working directly on the backing arrays.
         *
         * Chars below 0x80 are copied as a single byte. Any other char is
         * first tried as a BMP character; only if that fails is it examined
         * as a surrogate, which keeps the common BMP path short. Both buffer
         * positions are updated on every exit path.
         *
         * @param src array-backed source buffer
         * @param dst array-backed destination buffer
         * @return UNDERFLOW when the input is exhausted or ends after a high
         *         surrogate, OVERFLOW when dst cannot hold the next
         *         character, otherwise a malformed-input or
         *         unmappable-character result
         */
        private CoderResult encodeArrayLoop(CharBuffer src,
                                            ByteBuffer dst)
        {
            final char[] in = src.array();
            final int inBase = src.arrayOffset();
            int inPos = inBase + src.position();
            final int inEnd = inBase + src.limit();

            final byte[] out = dst.array();
            final int outBase = dst.arrayOffset();
            int outPos = outBase + dst.position();
            final int outEnd = outBase + dst.limit();

            try {
                while (inPos < inEnd) {
                    final char ch = in[inPos];
                    int inLen = 1;
                    int outLen;

                    if (ch < 0x80) {  // ASCII
                        bb[0] = (byte)ch;
                        outLen = 1;
                    } else {
                        outLen = toEUC(ch, bb);
                        final boolean bmpFailed = (outLen == -1);
                        if (bmpFailed && Character.isHighSurrogate(ch)) {
                            if (inPos + 1 == inEnd)
                                return CoderResult.UNDERFLOW;
                            final char low = in[inPos + 1];
                            if (!Character.isLowSurrogate(low))
                                return CoderResult.malformedForLength(1);
                            outLen = toEUC(ch, low, bb);
                            inLen = 2;
                        } else if (bmpFailed && Character.isLowSurrogate(ch)) {
                            // a low surrogate with no preceding high one
                            return CoderResult.malformedForLength(1);
                        }
                    }

                    if (outLen == -1)
                        return CoderResult.unmappableForLength(inLen);
                    if (outEnd - outPos < outLen)
                        return CoderResult.OVERFLOW;
                    for (int k = 0; k < outLen; k++)
                        out[outPos++] = bb[k];
                    inPos += inLen;
                }
                return CoderResult.UNDERFLOW;
            } finally {
                src.position(inPos - inBase);
                dst.position(outPos - outBase);
            }
        }

        /**
         * Encodes using only the relative get/put methods of the buffers;
         * used when either buffer has no accessible backing array.
         *
         * Follows the same rules as the array variant: ASCII is copied,
         * other chars are tried as BMP first and as a surrogate pair only
         * if that fails. On exit the source position is reset to the first
         * char that was not completely encoded.
         *
         * @param src source buffer
         * @param dst destination buffer
         * @return UNDERFLOW when the input is exhausted or ends after a high
         *         surrogate, OVERFLOW when dst cannot hold the next
         *         character, otherwise a malformed-input or
         *         unmappable-character result
         */
        private CoderResult encodeBufferLoop(CharBuffer src,
                                             ByteBuffer dst)
        {
            // position of the first source char not yet fully encoded
            int consumed = src.position();

            try {
                while (src.hasRemaining()) {
                    final char ch = src.get();
                    int inLen = 1;
                    int outLen;

                    if (ch < 0x80) {   // ASCII
                        bb[0] = (byte)ch;
                        outLen = 1;
                    } else {
                        outLen = toEUC(ch, bb);
                        final boolean bmpFailed = (outLen == -1);
                        if (bmpFailed && Character.isHighSurrogate(ch)) {
                            if (!src.hasRemaining())
                                return CoderResult.UNDERFLOW;
                            final char low = src.get();
                            if (!Character.isLowSurrogate(low))
                                return CoderResult.malformedForLength(1);
                            outLen = toEUC(ch, low, bb);
                            inLen = 2;
                        } else if (bmpFailed && Character.isLowSurrogate(ch)) {
                            // a low surrogate with no preceding high one
                            return CoderResult.malformedForLength(1);
                        }
                    }

                    if (outLen == -1)
                        return CoderResult.unmappableForLength(inLen);
                    if (dst.remaining() < outLen)
                        return CoderResult.OVERFLOW;
                    for (int k = 0; k < outLen; k++)
                        dst.put(bb[k]);
                    consumed += inLen;
                }
                return CoderResult.UNDERFLOW;
            } finally {
                src.position(consumed);
            }
        }

        /**
         * Dispatches to the array-based loop when both buffers expose a
         * backing array, and to the buffer-based loop otherwise.
         *
         * @param src source buffer
         * @param dst destination buffer
         * @return the result of the selected loop
         */
        protected CoderResult encodeLoop(CharBuffer src, ByteBuffer dst)
        {
            final boolean arrayBacked = src.hasArray() && dst.hasArray();
            return arrayBacked ? encodeArrayLoop(src, dst)
                               : encodeBufferLoop(src, dst);
        }

        /**
         * Encodes a supplementary character given as a surrogate pair.
         * Only code points in the range 0x20000..0x2FFFF are looked up;
         * the result is always the four-byte form SS2, plane byte, two
         * data bytes.
         *
         * @param hi  high surrogate
         * @param low low surrogate
         * @param bb  buffer of at least four bytes that receives the result
         * @return 4 when the character was encoded, -1 when it has no
         *         mapping
         */
        static int encode(char hi, char low, byte[] bb) {
            final int codePoint = Character.toCodePoint(hi, low);
            if ((codePoint & 0xf0000) != 0x20000)
                return -1;

            // the tables are indexed by the offset from 0x20000
            final int low16 = codePoint - 0x20000;
            final int base = c2bSuppIndex[low16 >> 8];
            if (base == UNMAPPABLE_ENCODING)
                return -1;

            final int slot = base + (low16 & 0xff);
            final int db = c2bSupp[slot];
            if (db == UNMAPPABLE_ENCODING)
                return -1;

            // supplementary plane info lives in the high nibble
            final int plane = (c2bPlane[slot] >> 4) & 0xf;
            bb[0] = (byte)SS2;
            bb[1] = (byte)(0xa0 | plane);
            bb[2] = (byte)(db >> 8);
            bb[3] = (byte)db;
            return 4;
        }

        /**
         * Encodes a single BMP char. A stored plane value of 0 yields the
         * two-byte form; any other plane yields the four-byte form SS2,
         * plane byte, two data bytes.
         *
         * @param c  the char to encode
         * @param bb buffer of at least four bytes that receives the result
         * @return 2 or 4, the number of bytes written, or -1 when the char
         *         has no mapping
         */
        static int encode(char c, byte[] bb) {
            final int base = c2bIndex[c >> 8];
            if (base == UNMAPPABLE_ENCODING)
                return -1;

            final int slot = base + (c & 0xff);
            final int db = c2b[slot];
            if (db == UNMAPPABLE_ENCODING)
                return -1;

            // BMP plane info lives in the low nibble
            final int plane = c2bPlane[slot] & 0xf;
            if (plane != 0) {
                bb[0] = (byte)SS2;
                bb[1] = (byte)(0xa0 | plane);
                bb[2] = (byte)(db >> 8);
                bb[3] = (byte)db;
                return 4;
            }
            bb[0] = (byte)(db >> 8);
            bb[1] = (byte)db;
            return 2;
        }

        // Char->byte tables, assigned once by the static initializer of this
        // class.
        // c2b / c2bSupp: the two EUC-TW data bytes packed as
        // (b1 << 8) + b2, or UNMAPPABLE_ENCODING where there is no mapping.
        static final char[] c2b;
        // c2bIndex / c2bSuppIndex: indexed by the high byte of the char (for
        // supplementary characters, of its offset from 0x20000); the value
        // plus the low byte gives the position in c2b / c2bSupp.
        static final char[] c2bIndex;
        static final char[] c2bSupp;
        static final char[] c2bSuppIndex;
        // c2bPlane: plane number per table position; the low nibble belongs
        // to the c2b entry, the high nibble to the c2bSupp entry.
        static final byte[] c2bPlane;
        /*
         * Builds the char->byte tables by inverting the decoder's b2c
         * tables.
         *
         * Every mapped entry of every plane is visited in ascending plane,
         * first-byte and second-byte order. The packed byte pair is stored
         * in c2b (BMP) or c2bSupp (supplementary), and the plane number in
         * the matching nibble of c2bPlane. If a character is reachable from
         * more than one entry, the entry visited last wins for the byte
         * pair; the plane nibble is therefore replaced rather than OR-ed, so
         * that plane and byte pair always come from the same source entry.
         */
        static {
            final int b1Min    =  Decoder.b1Min;
            final int b1Max    =  Decoder.b1Max;
            final int b2Min    =  Decoder.b2Min;
            final int b2Max    =  Decoder.b2Max;
            final String[] b2c = Decoder.b2c;
            final byte[] b2cIsSupp = Decoder.b2cIsSupp;

            c2bIndex = EUC_TWMapping.c2bIndex;
            c2bSuppIndex = EUC_TWMapping.c2bSuppIndex;
            char[] c2b0 = new char[EUC_TWMapping.C2BSIZE];
            char[] c2bSupp0 = new char[EUC_TWMapping.C2BSUPPSIZE];
            // one array serves both tables, so it must be as long as the
            // larger of the two
            byte[] c2bPlane0 = new byte[Math.max(EUC_TWMapping.C2BSIZE,
                                                 EUC_TWMapping.C2BSUPPSIZE)];

            Arrays.fill(c2b0, (char)UNMAPPABLE_ENCODING);
            Arrays.fill(c2bSupp0, (char)UNMAPPABLE_ENCODING);

            for (int p = 0; p < b2c.length; p++) {
                String db = b2c[p];
                /*
                   adjust the "plane" from 0..7 to 0, 2, 3, 4, 5, 6, 7, 0xf,
                   which helps balance between footprint (to save the plane
                   info in 4 bits) and runtime performance (to require only
                   one operation "0xa0 | plane" to encode the plane byte)
                */
                int plane = p;
                if (plane == 7)
                    plane = 0xf;
                else if (plane != 0)
                    plane = p + 1;

                int off = 0;
                for (int b1 = b1Min; b1 <= b1Max; b1++) {
                    for (int b2 = b2Min; b2 <= b2Max; b2++) {
                        char c = db.charAt(off);
                        if (c != UNMAPPABLE_DECODING) {
                            char packed = (char)((b1 << 8) + b2);
                            if ((b2cIsSupp[off] & (1 << p)) != 0) {
                                int index = c2bSuppIndex[c >> 8] + (c&0xff);
                                c2bSupp0[index] = packed;
                                // replace the high nibble, keep the low one
                                c2bPlane0[index] = (byte)
                                    ((c2bPlane0[index] & 0x0f) | (plane << 4));
                            } else {
                                int index = c2bIndex[c >> 8] + (c&0xff);
                                c2b0[index] = packed;
                                // replace the low nibble, keep the high one
                                c2bPlane0[index] = (byte)
                                    ((c2bPlane0[index] & 0xf0) | plane);
                            }
                        }
                        off++;
                    }
                }
            }
            c2b = c2b0;
            c2bSupp = c2bSupp0;
            c2bPlane = c2bPlane0;
        }
    }
}