TextAnalyzerTest.java revision a39bcfe2e58183496eab6572675e2896e5045fa7
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* See LICENSE.txt included in this distribution for the specific
* language governing permissions and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at LICENSE.txt.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved.
*/
package org.opensolaris.opengrok.analysis;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.Writer;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import junit.framework.Assert;
import org.apache.lucene.document.Document;
import org.junit.Test;
public class TextAnalyzerTest {
private String defaultEncoding = new InputStreamReader(new ByteArrayInputStream(new byte[0])).getEncoding();
private String encoding;
private String contents;
private static StreamSource getStreamSource(final byte[] bytes) {
return new StreamSource() {
@Override
public InputStream getStream() throws IOException {
return new ByteArrayInputStream(bytes);
}
};
}
@Test
public void defaultEncoding() throws IOException {
new TestableTextAnalyzer().analyze(new Document(),
getStreamSource("hello".getBytes()), null);
Assert.assertEquals(defaultEncoding, encoding);
Assert.assertEquals("hello", contents);
}
@Test
public void resetsStreamOnShortInput() throws IOException {
new TestableTextAnalyzer().analyze(new Document(),
getStreamSource("hi".getBytes()), null);
Assert.assertEquals(defaultEncoding, encoding);
Assert.assertEquals("hi", contents);
}
@Test
public void utf8WithBOM() throws IOException {
byte[] buffer = new byte[]{(byte) 239, (byte) 187, (byte) 191, 'h', 'e', 'l', 'l', 'o'};
new TestableTextAnalyzer().analyze(new Document(),
getStreamSource(buffer), null);
Assert.assertEquals("hello", contents);
Assert.assertEquals("UTF8", encoding);
}
@Test
public void utf16WithBOM() throws IOException {
final ByteBuffer utf16str = Charset.forName("UTF-16").encode("hello");
byte[] bytes = new byte[utf16str.remaining()];
utf16str.get(bytes, 0, bytes.length);
new TestableTextAnalyzer().analyze(new Document(),
getStreamSource(bytes), null);
Assert.assertEquals("UTF-16", encoding);
Assert.assertEquals("hello", contents);
}
@Test
public void utf16WithBOMAlternate() throws IOException {
final ByteBuffer utf16str = Charset.forName("UTF-16").encode("hello");
byte[] bytes = new byte[utf16str.remaining()];
utf16str.get(bytes, 0, bytes.length);
for (int i = 0; i < bytes.length; i += 2) {
byte b = bytes[i];
bytes[i] = bytes[i + 1];
bytes[i + 1] = b;
}
new TestableTextAnalyzer().analyze(new Document(),
getStreamSource(bytes), null);
Assert.assertEquals("UTF-16", encoding);
Assert.assertEquals("hello", contents);
}
public class TestableTextAnalyzer extends TextAnalyzer {
public TestableTextAnalyzer() {
super(null);
}
@Override
public void analyze(Document doc, StreamSource src, Writer xrefOut) throws IOException {
try (Reader r = getReader(src.getStream())) {
encoding = ((InputStreamReader) r).getEncoding();
StringBuilder sb = new StringBuilder();
int c;
while ((c = r.read()) != -1) {
sb.append((char) c);
}
contents = sb.toString();
}
}
}
}