/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* See LICENSE.txt included in this distribution for the specific
* language governing permissions and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at LICENSE.txt.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
package org.opensolaris.opengrok.analysis;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;
import org.apache.lucene.document.Document;
/**
* A {@link FileAnalyzer} which inspects the start of its input for a UTF-16
* or UTF-8 byte order mark and decodes the input accordingly. If no byte order
* mark is found, the platform's default charset is used.
*
* @author Chris Eldredge
* @version $Revision$
*/
public abstract class TextAnalyzer extends FileAnalyzer {

    /** Length in bytes of the longest byte order mark probed for (UTF-8). */
    private static final int MAX_BOM_LENGTH = 3;

    /**
     * Create a new instance.
     *
     * @param factory factory to use for fallback settings.
     */
    public TextAnalyzer(FileAnalyzerFactory factory) {
        super(factory);
    }

    /**
     * {@inheritDoc}
     * <p>
     * Peeks at the first bytes of the stream looking for a byte order mark
     * (BOM), picks the charset accordingly, and hands a reader using that
     * charset to {@link #analyze(Document, Reader)}:
     * <ul>
     * <li>{@code FE FF} or {@code FF FE}: "UTF-16"; the stream is rewound so
     *     the decoder sees the BOM itself.</li>
     * <li>{@code EF BB BF}: "UTF-8"; the BOM stays consumed.</li>
     * <li>anything else: the platform default charset; the stream is
     *     rewound.</li>
     * </ul>
     * The reader created here is not closed by this method.
     *
     * @param doc where to store results
     * @param in stream providing the raw bytes to analyze; if it does not
     *        support mark/reset it is wrapped in a buffered stream first
     * @throws IOException if reading from or resetting the stream fails, or
     *         if the chosen charset name is not supported
     */
    @Override
    public final void analyze(Document doc, InputStream in) throws IOException {
        // reset() only works on streams that support marking; wrap the
        // others instead of failing when the sniffed bytes are pushed back.
        InputStream stream = in.markSupported() ? in : new BufferedInputStream(in);

        stream.mark(MAX_BOM_LENGTH);
        byte[] head = new byte[MAX_BOM_LENGTH];
        int br = readHead(stream, head);

        String charset;
        // The length guard must cover BOTH UTF-16 variants, hence the extra
        // pair of parentheses around the "||" expression.
        if (br >= 2 &&
                ((head[0] == (byte) 0xFE && head[1] == (byte) 0xFF) ||
                 (head[0] == (byte) 0xFF && head[1] == (byte) 0xFE))) {
            charset = "UTF-16";
            stream.reset();
        } else if (br >= 3 && head[0] == (byte) 0xEF && head[1] == (byte) 0xBB &&
                head[2] == (byte) 0xBF) {
            // InputStreamReader does not properly discard BOM on UTF8 streams,
            // so don't reset the stream.
            charset = "UTF-8";
        } else {
            // No BOM recognised: give the bytes back and use the default.
            stream.reset();
            charset = Charset.defaultCharset().name();
        }

        analyze(doc, new InputStreamReader(stream, charset));
    }

    /**
     * Fill {@code head} from the stream, retrying after short reads, until
     * the buffer is full or the end of the stream is reached.
     *
     * @param in stream to read from
     * @param head buffer receiving the leading bytes of the stream
     * @return number of bytes stored in {@code head} (0 for an empty stream)
     * @throws IOException if reading from the stream fails
     */
    private static int readHead(InputStream in, byte[] head) throws IOException {
        int total = 0;
        while (total < head.length) {
            int n = in.read(head, total, head.length - total);
            if (n < 0) {
                break; // end of stream
            }
            total += n;
        }
        return total;
    }

    /**
     * Analyze the source provided by the given reader and store results into
     * the given document. NOTE: This method does not close the given reader
     * on return!
     *
     * @param doc where to store results
     * @param reader reader which provides the source to analyze.
     * @throws IOException if reading from the given reader fails
     */
    protected abstract void analyze(Document doc, Reader reader) throws IOException;
}