/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * See LICENSE.txt included in this distribution for the specific
 * language governing permissions and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at LICENSE.txt.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2010, 2015, Oracle and/or its affiliates. All rights reserved.
 */
package org.opensolaris.opengrok.analysis;

import static org.junit.Assert.assertEquals;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.Writer;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;

import org.apache.lucene.document.Document;
import org.junit.Test;

3155e2f2ec2ffa6e5e98f61f2deb990078ac9881Chris Eldredgepublic class TextAnalyzerTest {
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen
eb1776903fd1f998009e97470a65fba8a499a0d9Lubos Kosco private final String defaultEncoding = new InputStreamReader(new ByteArrayInputStream(new byte[0])).getEncoding();
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen private String encoding;
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen private String contents;
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen
a39bcfe2e58183496eab6572675e2896e5045fa7Knut Anders Hatlen private static StreamSource getStreamSource(final byte[] bytes) {
a39bcfe2e58183496eab6572675e2896e5045fa7Knut Anders Hatlen return new StreamSource() {
a39bcfe2e58183496eab6572675e2896e5045fa7Knut Anders Hatlen @Override
a39bcfe2e58183496eab6572675e2896e5045fa7Knut Anders Hatlen public InputStream getStream() throws IOException {
a39bcfe2e58183496eab6572675e2896e5045fa7Knut Anders Hatlen return new ByteArrayInputStream(bytes);
a39bcfe2e58183496eab6572675e2896e5045fa7Knut Anders Hatlen }
a39bcfe2e58183496eab6572675e2896e5045fa7Knut Anders Hatlen };
a39bcfe2e58183496eab6572675e2896e5045fa7Knut Anders Hatlen }
a39bcfe2e58183496eab6572675e2896e5045fa7Knut Anders Hatlen
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen @Test
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen public void defaultEncoding() throws IOException {
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen new TestableTextAnalyzer().analyze(new Document(),
a39bcfe2e58183496eab6572675e2896e5045fa7Knut Anders Hatlen getStreamSource("hello".getBytes()), null);
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen
1a2218353383d8cc9d1c01c95ad0a5fe94685f12Vladimir Kotal assertEquals(defaultEncoding, encoding);
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen
1a2218353383d8cc9d1c01c95ad0a5fe94685f12Vladimir Kotal assertEquals("hello", contents);
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen }
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen @Test
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen public void resetsStreamOnShortInput() throws IOException {
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen new TestableTextAnalyzer().analyze(new Document(),
a39bcfe2e58183496eab6572675e2896e5045fa7Knut Anders Hatlen getStreamSource("hi".getBytes()), null);
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen
1a2218353383d8cc9d1c01c95ad0a5fe94685f12Vladimir Kotal assertEquals(defaultEncoding, encoding);
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen
1a2218353383d8cc9d1c01c95ad0a5fe94685f12Vladimir Kotal assertEquals("hi", contents);
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen }
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen @Test
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen public void utf8WithBOM() throws IOException {
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen byte[] buffer = new byte[]{(byte) 239, (byte) 187, (byte) 191, 'h', 'e', 'l', 'l', 'o'};
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen new TestableTextAnalyzer().analyze(new Document(),
a39bcfe2e58183496eab6572675e2896e5045fa7Knut Anders Hatlen getStreamSource(buffer), null);
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen
1a2218353383d8cc9d1c01c95ad0a5fe94685f12Vladimir Kotal assertEquals("hello", contents);
1a2218353383d8cc9d1c01c95ad0a5fe94685f12Vladimir Kotal assertEquals("UTF8", encoding);
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen }
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen @Test
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen public void utf16WithBOM() throws IOException {
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen final ByteBuffer utf16str = Charset.forName("UTF-16").encode("hello");
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen byte[] bytes = new byte[utf16str.remaining()];
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen utf16str.get(bytes, 0, bytes.length);
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen new TestableTextAnalyzer().analyze(new Document(),
a39bcfe2e58183496eab6572675e2896e5045fa7Knut Anders Hatlen getStreamSource(bytes), null);
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen
1a2218353383d8cc9d1c01c95ad0a5fe94685f12Vladimir Kotal assertEquals("UTF-16", encoding);
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen
1a2218353383d8cc9d1c01c95ad0a5fe94685f12Vladimir Kotal assertEquals("hello", contents);
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen }
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen @Test
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen public void utf16WithBOMAlternate() throws IOException {
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen final ByteBuffer utf16str = Charset.forName("UTF-16").encode("hello");
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen byte[] bytes = new byte[utf16str.remaining()];
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen utf16str.get(bytes, 0, bytes.length);
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen for (int i = 0; i < bytes.length; i += 2) {
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen byte b = bytes[i];
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen bytes[i] = bytes[i + 1];
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen bytes[i + 1] = b;
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen }
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen new TestableTextAnalyzer().analyze(new Document(),
a39bcfe2e58183496eab6572675e2896e5045fa7Knut Anders Hatlen getStreamSource(bytes), null);
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen
1a2218353383d8cc9d1c01c95ad0a5fe94685f12Vladimir Kotal assertEquals("UTF-16", encoding);
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen
1a2218353383d8cc9d1c01c95ad0a5fe94685f12Vladimir Kotal assertEquals("hello", contents);
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen }
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen public class TestableTextAnalyzer extends TextAnalyzer {
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen public TestableTextAnalyzer() {
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen super(null);
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen }
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen @Override
a39bcfe2e58183496eab6572675e2896e5045fa7Knut Anders Hatlen public void analyze(Document doc, StreamSource src, Writer xrefOut) throws IOException {
a39bcfe2e58183496eab6572675e2896e5045fa7Knut Anders Hatlen try (Reader r = getReader(src.getStream())) {
a39bcfe2e58183496eab6572675e2896e5045fa7Knut Anders Hatlen encoding = ((InputStreamReader) r).getEncoding();
a39bcfe2e58183496eab6572675e2896e5045fa7Knut Anders Hatlen
a39bcfe2e58183496eab6572675e2896e5045fa7Knut Anders Hatlen StringBuilder sb = new StringBuilder();
a39bcfe2e58183496eab6572675e2896e5045fa7Knut Anders Hatlen int c;
a39bcfe2e58183496eab6572675e2896e5045fa7Knut Anders Hatlen while ((c = r.read()) != -1) {
a39bcfe2e58183496eab6572675e2896e5045fa7Knut Anders Hatlen sb.append((char) c);
a39bcfe2e58183496eab6572675e2896e5045fa7Knut Anders Hatlen }
a39bcfe2e58183496eab6572675e2896e5045fa7Knut Anders Hatlen
a39bcfe2e58183496eab6572675e2896e5045fa7Knut Anders Hatlen contents = sb.toString();
a39bcfe2e58183496eab6572675e2896e5045fa7Knut Anders Hatlen }
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen }
3733e9d930124c0050f9d5f27ee7a2f1b1b0bb2eKnut Anders Hatlen }
3155e2f2ec2ffa6e5e98f61f2deb990078ac9881Chris Eldredge}