opengrok/analysis/TextAnalyzer.java

	TextAnalyzer.java revision 1190
961N/A/*
961N/A * CDDL HEADER START
961N/A *
961N/A * The contents of this file are subject to the terms of the
961N/A * Common Development and Distribution License (the "License").
961N/A * You may not use this file except in compliance with the License.
961N/A *
961N/A * See LICENSE.txt included in this distribution for the specific
961N/A * language governing permissions and limitations under the License.
961N/A *
961N/A * When distributing Covered Code, include this CDDL HEADER in each
961N/A * file and include the License file at LICENSE.txt.
961N/A * If applicable, add the following below this CDDL HEADER, with the
961N/A * fields enclosed by brackets "[]" replaced with your own identifying
961N/A * information: Portions Copyright [yyyy] [name of copyright owner]
961N/A *
961N/A * CDDL HEADER END
961N/A */
961N/A
953N/Apackage org.opensolaris.opengrok.analysis;
953N/A
953N/Aimport java.io.IOException;
953N/Aimport java.io.InputStream;
953N/Aimport java.io.InputStreamReader;
957N/Aimport java.io.Reader;
953N/Aimport java.nio.charset.Charset;
953N/A
953N/Aimport org.apache.lucene.document.Document;
953N/A
953N/Apublic abstract class TextAnalyzer extends FileAnalyzer {
972N/A    public TextAnalyzer(FileAnalyzerFactory factory) {
972N/A        super(factory);
972N/A    }
953N/A
953N/A    public final void analyze(Document doc, InputStream in) throws IOException {
972N/A        String charset = null;
972N/A
972N/A        in.mark(3);
972N/A
972N/A        byte[] head = new byte[3];
972N/A        int br = in.read(head, 0, 3);
953N/A
970N/A        if (br >= 2 &&
970N/A                (head[0] == (byte)0xFE && head[1] == (byte)0xFF) ||
970N/A                (head[0] == (byte)0xFF && head[1] == (byte)0xFE)) {
970N/A            charset = "UTF-16";
970N/A            in.reset();
971N/A        } else if (br >= 3 && head[0] == (byte)0xEF && head[1] == (byte)0xBB &&
970N/A                head[2] == (byte)0xBF) {
970N/A            // InputStreamReader does not properly discard BOM on UTF8 streams,
970N/A            // so don't reset the stream.
970N/A            charset = "UTF-8";
970N/A        }
970N/A
972N/A        if (charset == null) {
972N/A            in.reset();
972N/A            charset = Charset.defaultCharset().name();
972N/A        }
972N/A
953N/A        analyze(doc, new InputStreamReader(in, charset));
953N/A    }
1190N/A
957N/A    protected abstract void analyze(Document doc, Reader reader) throws IOException;
953N/A}