cs/ext/COMPOUND_TEXT_Encoder.java

/*
 * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
package sun.nio.cs.ext;

import java.io.ByteArrayOutputStream;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.*;

import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

public class COMPOUND_TEXT_Encoder extends CharsetEncoder {

    /**
     * NOTE: The following four static variables should be used *only* for
     * testing whether a encoder can encode a specific character. They
     * cannot be used for actual encoding because they are shared across all
     * COMPOUND_TEXT encoders and may be stateful.
     */
    private static final Map<String,CharsetEncoder> encodingToEncoderMap =
      Collections.synchronizedMap(new HashMap<String,CharsetEncoder>(21, 1.0f));
    private static final CharsetEncoder latin1Encoder;
    private static final CharsetEncoder defaultEncoder;
    private static final boolean defaultEncodingSupported;

    static {
        CharsetEncoder encoder = Charset.defaultCharset().newEncoder();
        String encoding = encoder.charset().name();
        if ("ISO8859_1".equals(encoding)) {
            latin1Encoder = encoder;
            defaultEncoder = encoder;
            defaultEncodingSupported = true;
        } else {
            try {
                latin1Encoder =
                    Charset.forName("ISO8859_1").newEncoder();
            } catch (IllegalArgumentException e) {
                throw new ExceptionInInitializerError
                    ("ISO8859_1 unsupported");
            }
            defaultEncoder = encoder;
            defaultEncodingSupported = CompoundTextSupport.getEncodings().
                contains(defaultEncoder.charset().name());
        }
    }

    private CharsetEncoder encoder;
    private char[] charBuf = new char[1];
    private CharBuffer charbuf = CharBuffer.wrap(charBuf);
    private ByteArrayOutputStream nonStandardCharsetBuffer;
    private byte[] byteBuf;
    private ByteBuffer bytebuf;
    private int numNonStandardChars, nonStandardEncodingLen;

    public COMPOUND_TEXT_Encoder(Charset cs) {
        super(cs,
              (float)(CompoundTextSupport.MAX_CONTROL_SEQUENCE_LEN + 2),
              (float)(CompoundTextSupport.MAX_CONTROL_SEQUENCE_LEN + 2));
        try {
            encoder = Charset.forName("ISO8859_1").newEncoder();
        } catch (IllegalArgumentException cannotHappen) {}
        initEncoder(encoder);
    }

    protected CoderResult encodeLoop(CharBuffer src, ByteBuffer des) {
        CoderResult cr = CoderResult.UNDERFLOW;
        char[] input = src.array();
        int inOff = src.arrayOffset() + src.position();
        int inEnd = src.arrayOffset() + src.limit();

        try {
            while (inOff < inEnd && cr.isUnderflow()) {
                charBuf[0] = input[inOff];
                if (charBuf[0] <= '\u0008' ||
                    (charBuf[0] >= '\u000B' && charBuf[0] <= '\u001F') ||
                    (charBuf[0] >= '\u0080' && charBuf[0] <= '\u009F')) {
                    // The compound text specification only permits the octets
                    // 0x09, 0x0A, 0x1B, and 0x9B in C0 and C1. Of these, 1B and
                    // 9B must also be removed because they initiate control
                    // sequences.
                    charBuf[0] = '?';
                }

                CharsetEncoder enc = getEncoder(charBuf[0]);
                //System.out.println("char=" + charBuf[0] + ", enc=" + enc);
                if (enc == null) {
                    if (unmappableCharacterAction()
                        == CodingErrorAction.REPORT) {
                        charBuf[0] = '?';
                        enc = latin1Encoder;
                    } else {
                        return CoderResult.unmappableForLength(1);
                    }
                }
                if (enc != encoder) {
                    if (nonStandardCharsetBuffer != null) {
                        cr = flushNonStandardCharsetBuffer(des);
                    } else {
                        //cr= encoder.flush(des);
                        flushEncoder(encoder, des);
                    }
                    if (!cr.isUnderflow())
                        return cr;
                    byte[] escSequence = CompoundTextSupport.
                        getEscapeSequence(enc.charset().name());
                    if (escSequence == null) {
                        throw new InternalError("Unknown encoding: " +
                                                enc.charset().name());
                    } else if (escSequence[1] == (byte)0x25 &&
                               escSequence[2] == (byte)0x2F) {
                        initNonStandardCharsetBuffer(enc, escSequence);
                    } else if (des.remaining() >= escSequence.length) {
                        des.put(escSequence, 0, escSequence.length);
                    } else {
                        return CoderResult.OVERFLOW;
                    }
                    encoder = enc;
                    continue;
                }
                charbuf.rewind();
                if (nonStandardCharsetBuffer == null) {
                    cr = encoder.encode(charbuf, des, false);
                } else {
                    bytebuf.clear();
                    cr = encoder.encode(charbuf, bytebuf, false);
                    bytebuf.flip();
                    nonStandardCharsetBuffer.write(byteBuf,
                                                   0, bytebuf.limit());
                    numNonStandardChars++;
                }
                inOff++;
            }
            return cr;
        } finally {
            src.position(inOff - src.arrayOffset());
        }
    }

    protected CoderResult implFlush(ByteBuffer out) {
        CoderResult cr = (nonStandardCharsetBuffer != null)
            ? flushNonStandardCharsetBuffer(out)
            //: encoder.flush(out);
            : flushEncoder(encoder, out);
        reset();
        return cr;
    }

    private void initNonStandardCharsetBuffer(CharsetEncoder c,
                                              byte[] escSequence)
    {
        nonStandardCharsetBuffer = new ByteArrayOutputStream();
        byteBuf = new byte[(int)c.maxBytesPerChar()];
        bytebuf = ByteBuffer.wrap(byteBuf);
        nonStandardCharsetBuffer.write(escSequence, 0, escSequence.length);
        nonStandardCharsetBuffer.write(0); // M placeholder
        nonStandardCharsetBuffer.write(0); // L placeholder
        byte[] encoding = CompoundTextSupport.
            getEncoding(c.charset().name());
        if (encoding == null) {
            throw new InternalError
                ("Unknown encoding: " + encoder.charset().name());
        }
        nonStandardCharsetBuffer.write(encoding, 0, encoding.length);
        nonStandardCharsetBuffer.write(0x02); // divider
        nonStandardEncodingLen = encoding.length + 1;
    }

    private CoderResult flushNonStandardCharsetBuffer(ByteBuffer out) {
        if (numNonStandardChars > 0) {
            byte[] flushBuf = new byte[(int)encoder.maxBytesPerChar() *
                                       numNonStandardChars];
            ByteBuffer bb = ByteBuffer.wrap(flushBuf);
            flushEncoder(encoder, bb);
            bb.flip();
            nonStandardCharsetBuffer.write(flushBuf, 0, bb.limit());
            numNonStandardChars = 0;
        }

        int numBytes = nonStandardCharsetBuffer.size();
        int nonStandardBytesOff = 6 + nonStandardEncodingLen;

        if (out.remaining() < (numBytes - nonStandardBytesOff) +
            nonStandardBytesOff * (((numBytes - nonStandardBytesOff) /
                                    ((1 << 14) - 1)) + 1))
        {
            return CoderResult.OVERFLOW;
        }

        byte[] nonStandardBytes =
            nonStandardCharsetBuffer.toByteArray();

        // The non-standard charset header only supports 2^14-1 bytes of data.
        // If we have more than that, we have to repeat the header.
        do {
            out.put((byte)0x1B);
            out.put((byte)0x25);
            out.put((byte)0x2F);
            out.put(nonStandardBytes[3]);

            int toWrite = Math.min(numBytes - nonStandardBytesOff,
                                   (1 << 14) - 1 - nonStandardEncodingLen);

            out.put((byte)
                (((toWrite + nonStandardEncodingLen) / 0x80) | 0x80)); // M
            out.put((byte)
                (((toWrite + nonStandardEncodingLen) % 0x80) | 0x80)); // L
            out.put(nonStandardBytes, 6, nonStandardEncodingLen);
            out.put(nonStandardBytes, nonStandardBytesOff, toWrite);
            nonStandardBytesOff += toWrite;
        } while (nonStandardBytesOff < numBytes);

        nonStandardCharsetBuffer = null;
        byteBuf = null;
        nonStandardEncodingLen = 0;
        return CoderResult.UNDERFLOW;
    }

    /**
     * Resets the encoder.
     * Call this method to reset the encoder to its initial state
     */
    protected void implReset() {
        numNonStandardChars = nonStandardEncodingLen = 0;
        nonStandardCharsetBuffer = null;
        byteBuf = null;
        try {
            encoder = Charset.forName("ISO8859_1").newEncoder();
        } catch (IllegalArgumentException cannotHappen) {
        }
        initEncoder(encoder);
    }

    /**
     * Return whether a character is mappable or not
     * @return true if a character is mappable
     */
    public boolean canEncode(char ch) {
        return getEncoder(ch) != null;
    }

    protected void implOnMalformedInput(CodingErrorAction newAction) {
        encoder.onUnmappableCharacter(newAction);
    }

    protected void implOnUnmappableCharacter(CodingErrorAction newAction) {
        encoder.onUnmappableCharacter(newAction);
    }

    protected void implReplaceWith(byte[] newReplacement) {
        if (encoder != null)
            encoder.replaceWith(newReplacement);
    }

    /**
     * Try to figure out which CharsetEncoder to use for conversion
     * of the specified Unicode character. The target character encoding
     * of the returned encoder is approved to be used with Compound Text.
     *
     * @param ch Unicode character
     * @return CharsetEncoder to convert the given character
     */
    private CharsetEncoder getEncoder(char ch) {
        // 1. Try the current encoder.
        if (encoder.canEncode(ch)) {
            return encoder;
        }

        // 2. Try the default encoder.
        if (defaultEncodingSupported && defaultEncoder.canEncode(ch)) {
            CharsetEncoder retval = null;
            try {
                retval = defaultEncoder.charset().newEncoder();
            } catch (UnsupportedOperationException cannotHappen) {
            }
            initEncoder(retval);
            return retval;
        }

        // 3. Try ISO8859-1.
        if (latin1Encoder.canEncode(ch)) {
            CharsetEncoder retval = null;
            try {
                retval = latin1Encoder.charset().newEncoder();
            } catch (UnsupportedOperationException cannotHappen) {}
            initEncoder(retval);
            return retval;
        }

        // 4. Brute force search of all supported encodings.
        for (String encoding : CompoundTextSupport.getEncodings())
        {
            CharsetEncoder enc = encodingToEncoderMap.get(encoding);
            if (enc == null) {
                enc = CompoundTextSupport.getEncoder(encoding);
                if (enc == null) {
                    throw new InternalError("Unsupported encoding: " +
                                            encoding);
                }
                encodingToEncoderMap.put(encoding, enc);
            }
            if (enc.canEncode(ch)) {
                CharsetEncoder retval = CompoundTextSupport.getEncoder(encoding);
                initEncoder(retval);
                return retval;
            }
        }

        return null;
    }

    private void initEncoder(CharsetEncoder enc) {
        try {
            enc.onUnmappableCharacter(CodingErrorAction.REPLACE)
                .replaceWith(replacement());
        } catch (IllegalArgumentException x) {}
    }

    private CharBuffer fcb= CharBuffer.allocate(0);
    private CoderResult flushEncoder(CharsetEncoder enc, ByteBuffer bb) {
        enc.encode(fcb, bb, true);
        return enc.flush(bb);
    }
}