opengrok/analysis/TextAnalyzer.java

	TextAnalyzer.java revision 953
953N/Apackage org.opensolaris.opengrok.analysis;
953N/A
953N/Aimport java.io.IOException;
953N/Aimport java.io.InputStream;
953N/Aimport java.io.InputStreamReader;
953N/Aimport java.nio.charset.Charset;
953N/A
953N/Aimport org.apache.lucene.document.Document;
953N/A
953N/Apublic abstract class TextAnalyzer extends FileAnalyzer {
953N/A    public TextAnalyzer(FileAnalyzerFactory factory) {
953N/A        super(factory);
953N/A    }
953N/A
953N/A    public final void analyze(Document doc, InputStream in) throws IOException {
953N/A        String charset = null;
953N/A
953N/A        in.mark(3);
953N/A
953N/A        byte[] head = new byte[3];
953N/A        int br = in.read(head, 0, 3);
953N/A
953N/A        if (br >= 2) {
953N/A            if ((head[0] == (byte)0xFE && head[1] == (byte)0xFF) || (head[0] == (byte)0xFF && (byte)head[1] == (byte)0xFE)) {
953N/A                charset = "UTF16";
953N/A                in.reset();
953N/A            }
953N/A        }
953N/A        if (br >= 3) {
953N/A            if (head[0] == (byte)0xEF && head[1] == (byte)0xBB && head[2] == (byte)0xBF) {
953N/A                /* InputStreamReader does not properly discard BOM on UTF8 streams,
953N/A                 * so don't reset the stream. */
953N/A                charset = "UTF8";
953N/A            }
953N/A        }
953N/A
953N/A        if (charset == null) {
953N/A            in.reset();
953N/A            charset = Charset.defaultCharset().name();
953N/A        }
953N/A
953N/A        analyze(doc, new InputStreamReader(in, charset));
953N/A    }
953N/A
953N/A    protected abstract void analyze(Document doc, InputStreamReader reader) throws IOException;
953N/A}