JavaSymbolTokenizer.lex revision 2b46c6c34fcb824c622e7da13c019de88e8474a7
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* See LICENSE.txt included in this distribution for the specific
* language governing permissions and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at LICENSE.txt.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
* Gets Java symbols - ignores comments, strings, keywords
*/
package org.opensolaris.opengrok.analysis.java;
import java.util.*;
import java.io.*;
import org.apache.lucene.analysis.*;
%%
%public
%class JavaSymbolTokenizer
%extends Tokenizer
%unicode
%function next
%type Token
%{
/**
 * Does nothing: this scanner performs no cleanup when closed.
 * NOTE(review): the Reader handed to the constructor is therefore not
 * closed by this call; callers that opened it must close it themselves.
 */
public void close() {
}
/**
 * Points this scanner at an in-memory character buffer instead of a
 * Reader, so the same instance can be reused for new input.
 *
 * @param buf characters to scan; installed directly as the scanner
 *            buffer (the array is not copied)
 * @param len value stored as the end-of-data index for buf
 */
public void reInit(char[] buf, int len) {
    // Reset the generated scanner first; there is no Reader behind this input.
    yyreset((Reader) null);

    // Mark the input as exhausted up front - presumably so the scanner
    // never attempts to refill from the null reader (confirm against the
    // generated code).
    zzAtEOF = true;

    // Install the caller's buffer and the window to be scanned.
    zzBuffer = buf;
    zzStartRead = 0;
    zzEndRead = len;
}
/**
 * Command-line driver: tokenizes each file named on the command line and
 * prints every returned token as {@code text [start-end]} on stdout.
 * After each file the time elapsed since the start of the whole run is
 * written to stderr, so the figure is cumulative across files.
 *
 * @param argv paths of the files to tokenize; a usage message is printed
 *             when none are given
 */
public static void main(String argv[]) {
    if (argv.length == 0) {
        System.out.println("Usage : java JavaSymbolTokenizer <inputfiles>");
        return;
    }
    Date start = new Date();
    for (String arg: argv) {
        // Keep our own handle on the reader: close() on the scanner is a
        // no-op, so the file must be closed here explicitly.
        Reader in = null;
        try {
            in = new BufferedReader(new java.io.FileReader(arg));
            JavaSymbolTokenizer scanner = new JavaSymbolTokenizer(in);
            Token t;
            while ((t = scanner.next()) != null) {
                System.out.println(t.termText() + " ["+t.startOffset()+"-"+ t.endOffset()+"]");
            }
        }
        catch (Exception e) {
            // Report the failure and carry on with the next file.
            e.printStackTrace();
        }
        finally {
            if (in != null) {
                try {
                    in.close();
                }
                catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        long span = ((new Date()).getTime() - start.getTime());
        System.err.println("took: "+ span + " msec");
    }
}
%}
// A Java identifier: a Java identifier-start character followed by any
// number of Java identifier-part characters. The predefined JFlex classes
// are used so that '$' and non-ASCII letters are accepted as well; plain
// ASCII identifiers match exactly as they would with [a-zA-Z_][a-zA-Z0-9_]*.
Identifier = [:jletter:] [:jletterdigit:]*
// Exclusive-of-symbols states used while skipping over string literals
// (STRING), block comments (COMMENT), line comments (SCOMMENT) and
// character literals (QSTRING).
%state STRING COMMENT SCOMMENT QSTRING
%%
// Default state: return identifiers that are not listed in Consts.kwd and
// switch to a skipping state at the start of a literal or comment.
<YYINITIAL> {
{Identifier} {
    final String symbol = yytext();
    // When the lexeme is in Consts.kwd nothing is returned and scanning
    // simply continues with the next match.
    if (!Consts.kwd.contains(symbol)) {
        return new Token(symbol, zzStartRead, zzMarkedPos);
    }
}
\" { yybegin(STRING); }
\' { yybegin(QSTRING); }
"/*" { yybegin(COMMENT); }
"//" { yybegin(SCOMMENT); }
}
// Inside a double-quoted string literal; no rule in this group returns a token.
<STRING> {
// An unescaped double quote ends the literal.
\" { yybegin(YYINITIAL); }
// Consume an escaped backslash or an escaped double quote as one unit, so
// the quote of an escaped-quote sequence is not taken as the end of the literal.
\\\\ | \\\" {}
}
// Inside a single-quoted character literal; no rule in this group returns a token.
<QSTRING> {
// An unescaped single quote ends the literal.
\' { yybegin(YYINITIAL); }
// Consume an escaped backslash or an escaped single quote as one unit.
// Without this, a literal holding an escaped quote would end at that
// quote and its real closing quote would open a bogus literal that
// swallows the code following it.
\\\\ | \\\' {}
}
// Inside a block comment: the closing delimiter returns to the default state.
<COMMENT> {
"*/" { yybegin(YYINITIAL);}
}
// Inside a line comment: a newline returns to the default state.
<SCOMMENT> {
\n { yybegin(YYINITIAL);}
}
// Fallback rules shared by every state.
<YYINITIAL, STRING, COMMENT, SCOMMENT, QSTRING> {
// End of input: signal that there are no more tokens.
<<EOF>> { return null;}
// Catch-all: silently skip any single character no other rule matched.
// Written as a negated class plus \n rather than with "." so that it
// covers every character, including \r and other line terminators that
// "." may exclude depending on the JFlex version; an unmatched character
// would otherwise abort the scan.
[^\n] | \n {}
}