/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* See LICENSE.txt included in this distribution for the specific
* language governing permissions and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at LICENSE.txt.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
*/
package org.opensolaris.opengrok.analysis;
import java.io.CharArrayReader;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayDeque;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
/**
* This class was created because of lucene:
* <ul>
* <li>2.4.1 update which introduced char[] in Tokens instead of String</li>
* <li>3.0.0 uses AttributeSource instead of Tokens to make things even easier :-D</li>
* <li>3.5.0 uses CharTermAttribute</li>
* </ul>
*
* Generally this is a "template" for all new Tokenizers, so be careful when
* changing it; it will impact almost ALL symbol tokenizers in OpenGrok ...
*
* Created on August 24, 2009
* @author Lubos Kosco
*/
public abstract class JFlexTokenizer extends Tokenizer {

    /** Stack to remember the order of relevant states for the current parser. */
    protected ArrayDeque<Integer> stateStack = new ArrayDeque<Integer>();

    /** Term text of the current Token. */
    protected CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

    /** Start and end character offset of the current Token. */
    protected OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

    /**
     * Position of the current Token relative to the previous Token within the
     * related TokenStream.
     */
    protected PositionIncrementAttribute posIncrAtt =
            addAttribute(PositionIncrementAttribute.class);

    /**
     * Create a new tokenizer using the given stream.
     * @param input input to process. Might be {@code null}.
     */
    public JFlexTokenizer(Reader input) {
        super(input);
    }

    /**
     * Run the scanner to get the next token from the input.
     * @return {@code true} if a new Token is available/was found.
     * @throws IOException if reading from the input fails
     */
    public abstract boolean yylex() throws IOException;

    /**
     * Closes the current input stream, and resets the scanner to read from the
     * given input stream. All internal variables are reset, the old input
     * stream cannot be reused (content of the internal buffer is discarded and
     * lost). The lexical state is set to {@code YY_INITIAL}.
     * @param reader the new input stream to operate on.
     */
    public abstract void yyreset(Reader reader);

    /**
     * Closes the input stream in use. All subsequent calls to the scanning
     * method will return the end of file value.
     * @throws IOException if closing the input stream fails
     */
    public abstract void yyclose() throws IOException;

    /**
     * Enter the given lexical state.
     * @param newState state to enter
     */
    public abstract void yybegin(int newState);

    /**
     * Get the current lexical state of the scanner.
     * @return a lexical state.
     */
    public abstract int yystate();

    /**
     * Reinitialize the tokenizer with new contents.
     * Any lexical states still remembered on {@link #stateStack} are dropped
     * before the scanner is reset, so that states pushed while scanning the
     * previous contents cannot be popped while scanning the new ones.
     *
     * @param contents a char buffer with text to tokenize
     * @param length the number of characters to use from the char buffer
     * @see #yyreset(Reader)
     */
    public final void reInit(char[] contents, int length) {
        // yyreset() restarts the scanner itself; the state stack is owned by
        // this class and therefore has to be emptied here.
        stateStack.clear();
        yyreset(new CharArrayReader(contents, 0, length));
    }

    /**
     * Close the scanner including the input stream in use.
     * @throws IOException if {@link #yyclose()} fails
     * @see #yyclose()
     */
    @Override
    public final void close() throws IOException {
        // NOTE(review): super.close() is not invoked here; confirm that the
        // reader handed to the constructor is the one released by yyclose().
        yyclose();
    }

    /**
     * Go forward to the next available token.
     * @return {@code false} if no more tokens available.
     * @throws IOException if the scanner fails to read its input
     * @see #yylex()
     */
    @Override
    public final boolean incrementToken() throws IOException {
        return yylex();
    }

    /**
     * Reset the attributes for the current Token.
     * NOTE: For now PositionIncrement gets automatically set to {@code 1}.
     * @param str Token text to set.
     * @param start the start position of the current Token
     * @param end the end position of the current Token
     * @see #termAtt
     * @see #offsetAtt
     * @see #posIncrAtt
     */
    protected void setAttribs(String str, int start, int end) {
        //FIXME increasing below by one(default) might be tricky, need more analysis
        // after lucene upgrade to 3.5 below is most probably not even needed
        posIncrAtt.setPositionIncrement(1);
        // Replace (not extend) whatever term text the previous token left behind.
        termAtt.setEmpty();
        termAtt.append(str);
        offsetAtt.setOffset(start, end);
    }

    /**
     * Push the current state to the state order stack and enter the given state.
     * @param newState new state to enter.
     * @see #stateStack
     * @see #yystate()
     * @see #yybegin(int)
     */
    @SuppressWarnings("boxing")
    public void yypush(int newState) {
        // Remember where we came from first, then switch.
        stateStack.push(yystate());
        yybegin(newState);
    }

    /**
     * Pop the last entry from the state order stack and enter it.
     * @throws java.util.NoSuchElementException if the state order stack is empty
     * @see #stateStack
     * @see #yybegin(int)
     */
    @SuppressWarnings("boxing")
    public void yypop() {
        yybegin(stateStack.pop());
    }
}