/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * See LICENSE.txt included in this distribution for the specific
 * language governing permissions and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at LICENSE.txt.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

package org.opensolaris.opengrok.analysis;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;

import org.apache.lucene.document.Document;

/**
 * A {@link FileAnalyzer} which tries to detect the charset of the input from
 * a leading UTF-16 or UTF-8 byte order mark and uses it for the analysis. If
 * the charset cannot be detected, the platform's default charset is used.
 * 
 * @author 	Chris Eldredge
 * @version	$Revision$
 */
public abstract class TextAnalyzer extends FileAnalyzer {

    /** Length in bytes of the longest byte order mark this class looks for. */
    private static final int MAX_BOM_LENGTH = 3;

    /**
     * Create a new instance.
     * @param factory   factory to use for fallback settings.
     */
    public TextAnalyzer(FileAnalyzerFactory factory) {
        super(factory);
    }

    /**
     * {@inheritDoc}
     * <p>
     * Inspects up to the first three bytes of the stream for a byte order
     * mark (BOM) and picks the charset accordingly:
     * <ul>
     * <li>{@code FE FF} or {@code FF FE}: UTF-16; the stream is rewound so
     * that the decoder sees the BOM itself.</li>
     * <li>{@code EF BB BF}: UTF-8; the BOM stays consumed.</li>
     * <li>anything else: the platform default charset; the stream is
     * rewound.</li>
     * </ul>
     * The resulting reader is handed to {@link #analyze(Document, Reader)}.
     * Streams that do not support {@code mark}/{@code reset} are wrapped in
     * a {@link BufferedInputStream} so that rewinding works.
     *
     * @param doc   where to store results
     * @param in    stream which provides the source to analyze
     * @throws IOException if reading from or rewinding the stream fails
     */
    @Override
    public final void analyze(Document doc, InputStream in) throws IOException {
        // mark()/reset() are optional operations of InputStream; without
        // support for them the sniffed bytes could not be pushed back.
        InputStream stream =
                in.markSupported() ? in : new BufferedInputStream(in);

        stream.mark(MAX_BOM_LENGTH);
        byte[] head = new byte[MAX_BOM_LENGTH];
        int br = readHead(stream, head);

        Charset charset;
        if (br >= 2 && isUtf16Bom(head)) {
            // Rewind: the UTF-16 decoder uses the BOM to pick the byte order.
            stream.reset();
            charset = Charset.forName("UTF-16");
        } else if (br >= 3 && isUtf8Bom(head)) {
            // InputStreamReader does not properly discard BOM on UTF8 streams,
            // so don't reset the stream.
            charset = Charset.forName("UTF-8");
        } else {
            stream.reset();
            charset = Charset.defaultCharset();
        }

        analyze(doc, new InputStreamReader(stream, charset));
    }

    /**
     * Fill the given buffer from the stream, reading repeatedly because a
     * single {@code read} call may deliver fewer bytes than requested.
     *
     * @param in    stream to read from
     * @param buf   buffer to fill
     * @return number of bytes stored in {@code buf}; less than
     *         {@code buf.length} only if the end of the stream was reached
     * @throws IOException if reading from the stream fails
     */
    private static int readHead(InputStream in, byte[] buf) throws IOException {
        int total = 0;
        while (total < buf.length) {
            int n = in.read(buf, total, buf.length - total);
            if (n < 0) {
                break;
            }
            total += n;
        }
        return total;
    }

    /**
     * Check whether the buffer starts with a UTF-16 BOM of either byte order.
     *
     * @param head  buffer holding at least two bytes
     * @return {@code true} if the first two bytes are FE FF or FF FE
     */
    private static boolean isUtf16Bom(byte[] head) {
        boolean bigEndian = head[0] == (byte) 0xFE && head[1] == (byte) 0xFF;
        boolean littleEndian = head[0] == (byte) 0xFF && head[1] == (byte) 0xFE;
        return bigEndian || littleEndian;
    }

    /**
     * Check whether the buffer starts with the UTF-8 BOM.
     *
     * @param head  buffer holding at least three bytes
     * @return {@code true} if the first three bytes are EF BB BF
     */
    private static boolean isUtf8Bom(byte[] head) {
        return head[0] == (byte) 0xEF
                && head[1] == (byte) 0xBB
                && head[2] == (byte) 0xBF;
    }

    /**
     * Analyze the source provided by the given reader and store results into
     * the given document. NOTE: This method does not close the given reader
     * on return!
     *
     * @param doc   where to store results
     * @param reader    reader which provides the source to analyze.
     * @throws IOException if an I/O error occurs
     */
    protected abstract void analyze(Document doc, Reader reader) throws IOException;
}