TextAnalyzer.java revision 971
98N/A/*
98N/A * CDDL HEADER START
98N/A *
98N/A * The contents of this file are subject to the terms of the
98N/A * Common Development and Distribution License (the "License").
98N/A * You may not use this file except in compliance with the License.
98N/A *
98N/A * See LICENSE.txt included in this distribution for the specific
98N/A * language governing permissions and limitations under the License.
98N/A *
98N/A * When distributing Covered Code, include this CDDL HEADER in each
98N/A * file and include the License file at LICENSE.txt.
98N/A * If applicable, add the following below this CDDL HEADER, with the
98N/A * fields enclosed by brackets "[]" replaced with your own identifying
98N/A * information: Portions Copyright [yyyy] [name of copyright owner]
98N/A *
98N/A * CDDL HEADER END
98N/A */
98N/A
98N/Apackage org.opensolaris.opengrok.analysis;
98N/A
98N/Aimport java.io.IOException;
98N/Aimport java.io.InputStream;
98N/Aimport java.io.InputStreamReader;
98N/Aimport java.io.Reader;
98N/Aimport java.nio.charset.Charset;
98N/A
98N/Aimport org.apache.lucene.document.Document;
194N/A
194N/Apublic abstract class TextAnalyzer extends FileAnalyzer {
194N/A public TextAnalyzer(FileAnalyzerFactory factory) {
194N/A super(factory);
98N/A }
98N/A
98N/A public final void analyze(Document doc, InputStream in) throws IOException {
98N/A String charset = null;
98N/A
98N/A in.mark(3);
98N/A
194N/A byte[] head = new byte[3];
194N/A int br = in.read(head, 0, 3);
194N/A
98N/A if (br >= 2 &&
98N/A (head[0] == (byte)0xFE && head[1] == (byte)0xFF) ||
98N/A (head[0] == (byte)0xFF && head[1] == (byte)0xFE)) {
98N/A charset = "UTF-16";
98N/A in.reset();
191N/A } else if (br >= 3 && head[0] == (byte)0xEF && head[1] == (byte)0xBB &&
98N/A head[2] == (byte)0xBF) {
98N/A // InputStreamReader does not properly discard BOM on UTF8 streams,
98N/A // so don't reset the stream.
194N/A charset = "UTF-8";
98N/A }
98N/A
98N/A if (charset == null) {
98N/A in.reset();
98N/A charset = Charset.defaultCharset().name();
98N/A }
98N/A
194N/A analyze(doc, new InputStreamReader(in, charset));
194N/A }
194N/A
98N/A protected abstract void analyze(Document doc, Reader reader) throws IOException;
98N/A}
98N/A