activation/registries/MailcapTokenizer.java

/*
 * Copyright (c) 1997, 2006, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package com.sun.activation.registries;

/**
 * A tokenizer for strings in the form of "foo/bar; prop1=val1; ... ".
 * Useful for parsing MIME content types and mailcap entries.
 *
 * <p>Tokens are pulled one at a time with {@link #nextToken()}; the type and
 * text of the most recently read token are then available from
 * {@link #getCurrentToken()} and {@link #getCurrentTokenValue()}.
 *
 * <p>An instance keeps a mutable cursor into its input and performs no
 * synchronization of its own.
 */
public class MailcapTokenizer {

    /** Token type of a character that is neither a delimiter nor legal in a string token. */
    public static final int UNKNOWN_TOKEN = 0;
    /** Token type reported before the first call to {@link #nextToken()}. */
    public static final int START_TOKEN = 1;
    /** Token type of a run of string-token characters, or of an auto-quoted value. */
    public static final int STRING_TOKEN = 2;
    /** Token type reported once the end of the input has been reached. */
    public static final int EOI_TOKEN = 5;
    /** Token type of the '/' delimiter; its value is the character itself. */
    public static final int SLASH_TOKEN = '/';
    /** Token type of the ';' delimiter; its value is the character itself. */
    public static final int SEMICOLON_TOKEN = ';';
    /** Token type of the '=' delimiter; its value is the character itself. */
    public static final int EQUALS_TOKEN = '=';

    /** Character that ends an auto-quoted token unless it is escaped. */
    private static final char AUTOQUOTE_TERMINATOR = ';';
    /** Character that escapes the character following it inside an auto-quoted token. */
    private static final char ESCAPE_CHAR = '\\';

    /** The string being tokenized. */
    private final String data;
    /** Cached length of {@link #data}. */
    private final int dataLength;
    /** Index of the next unread character of {@link #data}. */
    private int dataIndex;
    /** Type of the most recently read token. */
    private int currentToken;
    /** Text of the most recently read token; {@code null} at end of input. */
    private String currentTokenValue;
    /** Whether the next string token is read in auto-quoting mode. */
    private boolean isAutoquoting;

    /**
     * Creates a tokenizer positioned at the start of the given string.
     * Until {@link #nextToken()} is called the current token is
     * {@link #START_TOKEN} with an empty value, and auto-quoting is off.
     *
     * @param inputString the string to tokenize
     * @throws NullPointerException if {@code inputString} is {@code null}
     */
    public MailcapTokenizer(String inputString) {
        data = inputString;
        dataLength = inputString.length();
        dataIndex = 0;

        currentToken = START_TOKEN;
        currentTokenValue = "";

        isAutoquoting = false;
    }

    /**
     * Sets whether auto-quoting is on or off.
     *
     * <p>While auto-quoting is on, a token that does not start with ';' or
     * '=' consists of every character from the first non-whitespace
     * character up to the first unescaped ';' or the end of input, minus any
     * unescaped whitespace immediately preceding that end point. A backslash
     * escapes the character that follows it; the backslash itself is removed
     * from the token value.
     *
     * <p>This is required for handling command strings in a mailcap entry.
     *
     * @param value {@code true} to turn auto-quoting on, {@code false} to turn it off
     */
    public void setIsAutoquoting(boolean value) {
        isAutoquoting = value;
    }

    /**
     * Retrieves the type of the current token.
     *
     * @return the type of the token most recently read by {@link #nextToken()},
     *         or {@link #START_TOKEN} if no token has been read yet
     */
    public int getCurrentToken() {
        return currentToken;
    }

    /**
     * Gets a String that describes the given token type.
     *
     * @param token one of the {@code *_TOKEN} constants of this class
     * @return a short name for the token type, or {@code "really unknown"}
     *         if the value is not one of the token constants
     */
    public static String nameForToken(int token) {
        switch (token) {
            case UNKNOWN_TOKEN:
                return "unknown";
            case START_TOKEN:
                return "start";
            case STRING_TOKEN:
                return "string";
            case EOI_TOKEN:
                return "EOI";
            case SLASH_TOKEN:
                return "'/'";
            case SEMICOLON_TOKEN:
                return "';'";
            case EQUALS_TOKEN:
                return "'='";
            default:
                return "really unknown";
        }
    }

    /**
     * Retrieves the text of the current token.
     *
     * @return the text of the token most recently read by {@link #nextToken()};
     *         the empty string before the first read and {@code null} once
     *         the end of input has been reached
     */
    public String getCurrentTokenValue() {
        return currentTokenValue;
    }

    /**
     * Reads the next token, making it the current token.
     *
     * <p>Leading whitespace is skipped. With auto-quoting off, the token is
     * a run of string-token characters ({@link #STRING_TOKEN}), one of the
     * delimiters '/', ';' or '=', or any other single character
     * ({@link #UNKNOWN_TOKEN}). With auto-quoting on, the token is a ';' or
     * '=' delimiter or an auto-quoted {@link #STRING_TOKEN}, as described by
     * {@link #setIsAutoquoting(boolean)}.
     *
     * @return the type of the token that was read, {@link #EOI_TOKEN} when
     *         no input is left
     */
    public int nextToken() {
        //  skip white space
        while (dataIndex < dataLength
                && isWhiteSpaceChar(data.charAt(dataIndex))) {
            ++dataIndex;
        }

        if (dataIndex >= dataLength) {
            currentToken = EOI_TOKEN;
            currentTokenValue = null;
            return currentToken;
        }

        //  examine the current character and see what kind of token we have
        char c = data.charAt(dataIndex);
        if (isAutoquoting) {
            if (c == ';' || c == '=') {
                setSingleCharToken(c, c);
            } else {
                processAutoquoteToken();
            }
        } else if (isStringTokenChar(c)) {
            processStringToken();
        } else if (c == '/' || c == ';' || c == '=') {
            setSingleCharToken(c, c);
        } else {
            setSingleCharToken(UNKNOWN_TOKEN, c);
        }

        return currentToken;
    }

    /**
     * Records a one-character token of the given type and consumes the
     * character.
     */
    private void setSingleCharToken(int token, char c) {
        currentToken = token;
        currentTokenValue = String.valueOf(c);
        ++dataIndex;
    }

    /**
     * Consumes the run of string-token characters starting at the cursor and
     * records it as a {@link #STRING_TOKEN}.
     */
    private void processStringToken() {
        //  capture the initial index
        int initialIndex = dataIndex;

        //  skip to 1st non string token character
        while (dataIndex < dataLength
                && isStringTokenChar(data.charAt(dataIndex))) {
            ++dataIndex;
        }

        currentToken = STRING_TOKEN;
        currentTokenValue = data.substring(initialIndex, dataIndex);
    }

    /**
     * Consumes an auto-quoted token starting at the cursor and records it as
     * a {@link #STRING_TOKEN}.
     *
     * <p>The cursor is left on the terminating ';' (or at the end of input).
     * A backslash together with the character it escapes is always part of
     * the token, so an escaped ';' does not end it. Trailing whitespace that
     * is not escaped is left out of the token value.
     */
    private void processAutoquoteToken() {
        //  capture the initial index
        int initialIndex = dataIndex;

        //  exclusive end of the token text; lags behind the cursor while the
        //  cursor is passing over (possibly trailing) unescaped whitespace
        int tokenEnd = initialIndex;

        //  now skip to the 1st non-escaped autoquote termination character
        while (dataIndex < dataLength) {
            char c = data.charAt(dataIndex);
            if (c == AUTOQUOTE_TERMINATOR) {
                break;
            }

            if (c == ESCAPE_CHAR && dataIndex + 1 < dataLength) {
                //  step over the backslash and the character it escapes;
                //  an escaped character is significant even if whitespace
                dataIndex += 2;
                tokenEnd = dataIndex;
            } else {
                ++dataIndex;
                if (!isWhiteSpaceChar(c)) {
                    tokenEnd = dataIndex;
                }
            }
        }

        currentToken = STRING_TOKEN;
        currentTokenValue =
            fixEscapeSequences(data.substring(initialIndex, tokenEnd));
    }

    /**
     * Tells whether the character is one of the punctuation characters that
     * may not appear in a string token.
     */
    private static boolean isSpecialChar(char c) {
        switch (c) {
            case '(':
            case ')':
            case '<':
            case '>':
            case '@':
            case ',':
            case ';':
            case ':':
            case '\\':
            case '"':
            case '/':
            case '[':
            case ']':
            case '?':
            case '=':
                return true;
            default:
                return false;
        }
    }

    /** Tells whether the character is an ISO control character. */
    private static boolean isControlChar(char c) {
        return Character.isISOControl(c);
    }

    /** Tells whether the character is whitespace per {@link Character#isWhitespace(char)}. */
    private static boolean isWhiteSpaceChar(char c) {
        return Character.isWhitespace(c);
    }

    /**
     * Tells whether the character may be part of a string token, i.e. it is
     * not special, not a control character and not whitespace.
     */
    private static boolean isStringTokenChar(char c) {
        return !isSpecialChar(c) && !isControlChar(c) && !isWhiteSpaceChar(c);
    }

    /**
     * Removes backslash escaping from the given string: each backslash that
     * is followed by another character is dropped and that character is kept
     * verbatim. A backslash that is the last character is kept as is.
     */
    private static String fixEscapeSequences(String inputString) {
        int inputLength = inputString.length();
        StringBuilder buffer = new StringBuilder(inputLength);

        for (int i = 0; i < inputLength; ++i) {
            char currentChar = inputString.charAt(i);
            if (currentChar == ESCAPE_CHAR && i < inputLength - 1) {
                //  emit the escaped character and skip over it
                buffer.append(inputString.charAt(++i));
            } else {
                buffer.append(currentChar);
            }
        }

        return buffer.toString();
    }
}