opengrok/analysis/JFlexTokenizerTest.java

	JFlexTokenizerTest.java revision 1006
1004N/A/*
1004N/A * CDDL HEADER START
1004N/A *
1004N/A * The contents of this file are subject to the terms of the
1004N/A * Common Development and Distribution License (the "License").
1004N/A * You may not use this file except in compliance with the License.
1004N/A *
1004N/A * See LICENSE.txt included in this distribution for the specific
1004N/A * language governing permissions and limitations under the License.
1004N/A *
1004N/A * When distributing Covered Code, include this CDDL HEADER in each
1004N/A * file and include the License file at LICENSE.txt.
1004N/A * If applicable, add the following below this CDDL HEADER, with the
1004N/A * fields enclosed by brackets "[]" replaced with your own identifying
1004N/A * information: Portions Copyright [yyyy] [name of copyright owner]
1004N/A *
1004N/A * CDDL HEADER END
1004N/A */
1004N/A
1004N/A/*
1004N/A * Copyright 2010 Sun Microsystems.  All rights reserved.
1004N/A * Use is subject to license terms.
1004N/A */
1004N/A
1004N/Apackage org.opensolaris.opengrok.analysis;
1004N/A
1004N/Aimport java.io.Reader;
1004N/Aimport java.io.StringReader;
1004N/Aimport org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
1004N/Aimport org.apache.lucene.analysis.tokenattributes.TermAttribute;
1004N/Aimport org.junit.Test;
1004N/Aimport org.opensolaris.opengrok.analysis.c.CSymbolTokenizer;
1004N/Aimport org.opensolaris.opengrok.analysis.c.CxxSymbolTokenizer;
1004N/Aimport org.opensolaris.opengrok.analysis.document.TroffFullTokenizer;
1004N/Aimport org.opensolaris.opengrok.analysis.fortran.FortranSymbolTokenizer;
1004N/Aimport org.opensolaris.opengrok.analysis.java.JavaSymbolTokenizer;
1004N/Aimport org.opensolaris.opengrok.analysis.lisp.LispSymbolTokenizer;
1004N/Aimport org.opensolaris.opengrok.analysis.plain.PlainFullTokenizer;
1004N/Aimport org.opensolaris.opengrok.analysis.plain.PlainSymbolTokenizer;
1004N/Aimport org.opensolaris.opengrok.analysis.sh.ShSymbolTokenizer;
1004N/Aimport org.opensolaris.opengrok.analysis.tcl.TclSymbolTokenizer;
1004N/Aimport static org.junit.Assert.*;
1004N/A
1004N/A/**
1004N/A * Unit tests for JFlexTokenizer.
1004N/A */
1004N/Apublic class JFlexTokenizerTest {
1004N/A
1004N/A    /**
1004N/A     * Test that the various sub-classes of JFlexTokenizerTest return the
1004N/A     * correct offsets for the tokens. They used to give wrong values for
1004N/A     * the last token. Bug #15858.
1004N/A     */
1004N/A    @Test
1004N/A    public void testOffsetAttribute() throws Exception {
1004N/A        testOffsetAttribute(ShSymbolTokenizer.class);
1004N/A        testOffsetAttribute(TroffFullTokenizer.class);
1004N/A        testOffsetAttribute(PlainSymbolTokenizer.class);
1004N/A        testOffsetAttribute(PlainFullTokenizer.class);
1004N/A        testOffsetAttribute(CSymbolTokenizer.class);
1004N/A        testOffsetAttribute(CxxSymbolTokenizer.class);
1004N/A        testOffsetAttribute(JavaSymbolTokenizer.class);
1004N/A        testOffsetAttribute(LispSymbolTokenizer.class);
1004N/A        testOffsetAttribute(TclSymbolTokenizer.class);
1004N/A
1004N/A        // The Fortran tokenizer doesn't accept the default input text, so
1004N/A        // create a text fragment that it understands
1004N/A        testOffsetAttribute(FortranSymbolTokenizer.class,
1004N/A                "1 token1 = token2 + token3",
1004N/A                new String[] {"token1", "token2", "token3"});
1004N/A    }
1004N/A
1004N/A    /**
1004N/A     * Helper method for {@link #testOffsetAttribute()} that runs the test
1004N/A     * on one single implementation class.
1004N/A     */
1004N/A    private void testOffsetAttribute(Class <? extends JFlexTokenizer> klass)
1004N/A            throws Exception {
1004N/A        String inputText = "alpha beta gamma delta";
1004N/A        String[] expectedTokens = inputText.split(" ");
1004N/A        testOffsetAttribute(klass, inputText, expectedTokens);
1004N/A    }
1004N/A
1004N/A    /**
1004N/A     * Helper method for {@link #testOffsetAttribute()} that runs the test
1004N/A     * on one single implementation class with the specified input text and
1004N/A     * expected tokens.
1004N/A     */
1004N/A    private void testOffsetAttribute(Class <? extends JFlexTokenizer> klass,
1004N/A                                     String inputText, String[] expectedTokens)
1004N/A            throws Exception {
1004N/A        JFlexTokenizer tokenizer = klass.getConstructor(Reader.class)
1004N/A                .newInstance(new StringReader(inputText));
1004N/A
1004N/A        TermAttribute term = tokenizer.addAttribute(TermAttribute.class);
1004N/A        OffsetAttribute offset = tokenizer.addAttribute(OffsetAttribute.class);
1004N/A
1004N/A        int count = 0;
1004N/A        while (tokenizer.incrementToken()) {
1004N/A            assertTrue("too many tokens", count < expectedTokens.length);
1004N/A            String expected = expectedTokens[count];
1004N/A            assertEquals("term", expected, term.term());
1004N/A            assertEquals("start",
1004N/A                    inputText.indexOf(expected), offset.startOffset());
1004N/A            assertEquals("end",
1004N/A                    inputText.indexOf(expected) + expected.length(),
1004N/A                    offset.endOffset());
1004N/A            count++;
1004N/A        }
1004N/A
1004N/A        assertEquals("wrong number of tokens", expectedTokens.length, count);
1004N/A    }
1006N/A
1006N/A    /**
1006N/A     * The fix for bug #15858 caused a regression in ShSymbolTokenizer where
1006N/A     * variables on the form {@code ${VARIABLE}} were not correctly indexed
1006N/A     * if they were inside a quoted string. The closing brace would be part of
1006N/A     * the indexed term in that case.
1006N/A     */
1006N/A    @Test
1006N/A    public void testShellVariableInBraces() throws Exception {
1006N/A        // Shell command to tokenize
1006N/A        String inputText = "echo \"${VARIABLE} $abc xyz\"";
1006N/A        // "echo" is an ignored token in ShSymbolTokenizer, "xyz" is a string
1006N/A        // and not a symbol. Therefore, expect just the two tokens that name
1006N/A        // variables.
1006N/A        String[] expectedTokens = {"VARIABLE", "abc"};
1006N/A        testOffsetAttribute(ShSymbolTokenizer.class, inputText, expectedTokens);
1006N/A    }
1004N/A}