AnalyzerGuru.java revision 200
0N/A/*
0N/A * CDDL HEADER START
0N/A *
0N/A * The contents of this file are subject to the terms of the
0N/A * Common Development and Distribution License (the "License").
0N/A * You may not use this file except in compliance with the License.
0N/A *
0N/A * See LICENSE.txt included in this distribution for the specific
0N/A * language governing permissions and limitations under the License.
0N/A *
0N/A * When distributing Covered Code, include this CDDL HEADER in each
0N/A * file and include the License file at LICENSE.txt.
0N/A * If applicable, add the following below this CDDL HEADER, with the
0N/A * fields enclosed by brackets "[]" replaced with your own identifying
0N/A * information: Portions Copyright [yyyy] [name of copyright owner]
0N/A *
0N/A * CDDL HEADER END
0N/A */
0N/A
0N/A/*
143N/A * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
0N/A * Use is subject to license terms.
0N/A */
0N/Apackage org.opensolaris.opengrok.analysis;
0N/A
143N/Aimport java.io.BufferedInputStream;
143N/Aimport java.io.File;
143N/Aimport java.io.FileInputStream;
143N/Aimport java.io.IOException;
143N/Aimport java.io.InputStream;
143N/Aimport java.io.OutputStreamWriter;
143N/Aimport java.io.Reader;
143N/Aimport java.io.StringReader;
143N/Aimport java.io.Writer;
143N/Aimport java.lang.reflect.InvocationTargetException;
143N/Aimport java.lang.reflect.Method;
143N/Aimport java.util.ArrayList;
143N/Aimport java.util.HashMap;
143N/Aimport java.util.Iterator;
200N/Aimport java.util.Map;
143N/Aimport java.util.SortedMap;
143N/Aimport java.util.TreeMap;
143N/Aimport org.apache.lucene.analysis.Token;
143N/Aimport org.apache.lucene.analysis.TokenStream;
143N/Aimport org.apache.lucene.document.DateTools;
143N/Aimport org.apache.lucene.document.Document;
99N/Aimport org.apache.lucene.document.Field;
0N/Aimport org.opensolaris.opengrok.analysis.FileAnalyzer.Genre;
143N/Aimport org.opensolaris.opengrok.analysis.archive.BZip2Analyzer;
143N/Aimport org.opensolaris.opengrok.analysis.archive.GZIPAnalyzer;
143N/Aimport org.opensolaris.opengrok.analysis.archive.TarAnalyzer;
143N/Aimport org.opensolaris.opengrok.analysis.archive.ZipAnalyzer;
143N/Aimport org.opensolaris.opengrok.analysis.c.CAnalyzer;
143N/Aimport org.opensolaris.opengrok.analysis.data.IgnorantAnalyzer;
143N/Aimport org.opensolaris.opengrok.analysis.data.ImageAnalyzer;
0N/Aimport org.opensolaris.opengrok.analysis.document.TroffAnalyzer;
143N/Aimport org.opensolaris.opengrok.analysis.executables.ELFAnalyzer;
143N/Aimport org.opensolaris.opengrok.analysis.executables.JarAnalyzer;
143N/Aimport org.opensolaris.opengrok.analysis.executables.JavaClassAnalyzer;
3N/Aimport org.opensolaris.opengrok.analysis.java.JavaAnalyzer;
38N/Aimport org.opensolaris.opengrok.analysis.lisp.LispAnalyzer;
143N/Aimport org.opensolaris.opengrok.analysis.plain.PlainAnalyzer;
143N/Aimport org.opensolaris.opengrok.analysis.plain.XMLAnalyzer;
143N/Aimport org.opensolaris.opengrok.analysis.sh.ShAnalyzer;
190N/Aimport org.opensolaris.opengrok.analysis.sql.SQLAnalyzer;
58N/Aimport org.opensolaris.opengrok.configuration.Project;
143N/Aimport org.opensolaris.opengrok.history.Annotation;
143N/Aimport org.opensolaris.opengrok.history.HistoryGuru;
143N/Aimport org.opensolaris.opengrok.history.HistoryReader;
0N/Aimport org.opensolaris.opengrok.web.Util;
0N/A
0N/A/**
 * Manages and provides Analyzers as needed. Please see
143N/A * <a href="http://www.opensolaris.org/os/project/opengrok/manual/internals/">
143N/A * this</a> page for a great description of the purpose of the AnalyzerGuru.
143N/A *
0N/A * Created on September 22, 2005
0N/A * @author Chandan
0N/A */
0N/Apublic class AnalyzerGuru {
143N/A
36N/A private static HashMap<String, Class<? extends FileAnalyzer>> ext;
36N/A private static SortedMap<String, Class<? extends FileAnalyzer>> magics;
0N/A private static ArrayList<Method> matchers;
0N/A /*
0N/A * If you write your own analyzer please register it here
0N/A */
143N/A private static ArrayList<Class<? extends FileAnalyzer>> analyzers = new ArrayList<Class<? extends FileAnalyzer>>();
36N/A static {
36N/A analyzers.add(IgnorantAnalyzer.class);
36N/A analyzers.add(BZip2Analyzer.class);
36N/A analyzers.add(FileAnalyzer.class);
36N/A analyzers.add(XMLAnalyzer.class);
36N/A analyzers.add(TroffAnalyzer.class);
36N/A analyzers.add(ELFAnalyzer.class);
36N/A analyzers.add(JavaClassAnalyzer.class);
36N/A analyzers.add(ImageAnalyzer.class);
36N/A analyzers.add(JarAnalyzer.class);
36N/A analyzers.add(ZipAnalyzer.class);
36N/A analyzers.add(TarAnalyzer.class);
36N/A analyzers.add(CAnalyzer.class);
36N/A analyzers.add(ShAnalyzer.class);
36N/A analyzers.add(PlainAnalyzer.class);
36N/A analyzers.add(GZIPAnalyzer.class);
36N/A analyzers.add(JavaAnalyzer.class);
38N/A analyzers.add(LispAnalyzer.class);
190N/A analyzers.add(SQLAnalyzer.class);
36N/A }
143N/A private static HashMap<Class<? extends FileAnalyzer>, FileAnalyzer> analyzerInstances = new HashMap<Class<? extends FileAnalyzer>, FileAnalyzer>();
0N/A /**
0N/A * Initializes an AnalyzerGuru
0N/A */
0N/A static {
0N/A if (ext == null) {
36N/A ext = new HashMap<String, Class<? extends FileAnalyzer>>();
0N/A }
0N/A if (magics == null) {
36N/A magics = new TreeMap<String, Class<? extends FileAnalyzer>>();
0N/A // TODO: have a comparator
0N/A }
0N/A if (matchers == null) {
0N/A matchers = new ArrayList<Method>();
0N/A }
143N/A for (Class<? extends FileAnalyzer> analyzer : analyzers) {
143N/A try {
0N/A String[] suffixes = (String[]) analyzer.getField("suffixes").get(null);
143N/A for (String suffix : suffixes) {
0N/A //System.err.println(analyzer.getSimpleName() + " = " + suffix);
148N/A Class old = ext.put(suffix, analyzer);
148N/A assert old == null :
148N/A "suffix '" + suffix + "' used in multiple analyzers";
0N/A }
0N/A } catch (Exception e) {
0N/A // System.err.println("AnalyzerFinder:" + analyzer.getSimpleName() + e);
0N/A }
143N/A try {
0N/A String[] smagics = (String[]) analyzer.getField("magics").get(null);
143N/A for (String magic : smagics) {
0N/A //System.err.println(analyzer.getSimpleName() + " = " + magic);
0N/A magics.put(magic, analyzer);
0N/A }
0N/A } catch (Exception e) {
0N/A // System.err.println("AnalyzerFinder: " + analyzer.getSimpleName() + e);
0N/A }
143N/A try {
0N/A Method m = analyzer.getMethod("isMagic", byte[].class);
143N/A if (m != null) {
143N/A matchers.add(m);
143N/A }
0N/A } catch (Exception e) {
0N/A }
0N/A }
0N/A //System.err.println("Exts " + ext);
0N/A //System.err.println("Matchers " + matchers);
0N/A }
126N/A
143N/A /**
143N/A * Instruct the AnalyzerGuru to use a given analyzer for a given
143N/A * file extension.
143N/A * @param extension the file-extension to add
143N/A * @param analyzer the analyzer to use for the given extension
143N/A * (if you pass null as the analyzer, you will disable
143N/A * the analyzer used for that extension)
143N/A */
126N/A public static void addExtension(String extension, Class<? extends FileAnalyzer> analyzer) {
126N/A ext.remove(extension);
126N/A if (analyzer != null) {
126N/A ext.put(extension, analyzer);
126N/A }
126N/A }
143N/A
0N/A /*
0N/A * Get the default Analyzer.
0N/A */
0N/A public static FileAnalyzer getAnalyzer() {
143N/A
36N/A Class<FileAnalyzer> a = FileAnalyzer.class;
0N/A FileAnalyzer fa = analyzerInstances.get(a);
0N/A if (fa == null) {
0N/A try {
143N/A fa = a.newInstance();
0N/A analyzerInstances.put(a, fa);
0N/A return fa;
0N/A } catch (Exception e) {
0N/A System.err.println("ERROR: Initializing " + a);
0N/A }
0N/A }
0N/A return fa;
0N/A }
143N/A
143N/A /**
143N/A * Get an analyzer suited to analyze a file. This function will reuse
143N/A * analyzers since they are costly.
143N/A *
143N/A * @param in Input stream containing data to be analyzed
143N/A * @param file Name of the file to be analyzed
143N/A * @return An analyzer suited for that file content
143N/A * @throws java.io.IOException If an error occurs while accessing the
143N/A * data in the input stream.
0N/A */
143N/A public static FileAnalyzer getAnalyzer(InputStream in, String file) throws IOException {
143N/A Class<? extends FileAnalyzer> a = find(in, file);
143N/A if (a == null) {
0N/A a = FileAnalyzer.class;
0N/A }
0N/A if (a != null) {
0N/A FileAnalyzer fa = analyzerInstances.get(a);
0N/A if (fa == null) {
0N/A try {
0N/A fa = (FileAnalyzer) a.newInstance();
0N/A analyzerInstances.put(a, fa);
0N/A return fa;
0N/A } catch (Exception e) {
0N/A System.err.println("ERROR: Initializing " + a);
0N/A }
0N/A } else {
0N/A return fa;
0N/A }
0N/A }
0N/A return null;
0N/A }
143N/A
143N/A /**
143N/A * Create a Lucene document and fill in the required fields
143N/A * @param file The file to index
143N/A * @param in The data to generate the index for
143N/A * @param path Where the file is located (from source root)
143N/A * @return The Lucene document to add to the index database
143N/A * @throws java.io.IOException If an exception occurs while collecting the
143N/A * datas
143N/A */
143N/A public Document getDocument(File file, InputStream in, String path) throws IOException {
0N/A Document doc = new Document();
143N/A String date = DateTools.timeToString(file.lastModified(), DateTools.Resolution.MILLISECOND);
99N/A doc.add(new Field("u", Util.uid(path, date), Field.Store.YES, Field.Index.UN_TOKENIZED));
143N/A doc.add(new Field("fullpath", file.getAbsolutePath(), Field.Store.YES, Field.Index.TOKENIZED));
143N/A
143N/A try {
143N/A HistoryReader hr = HistoryGuru.getInstance().getHistoryReader(file);
0N/A if (hr != null) {
99N/A doc.add(new Field("hist", hr));
0N/A // date = hr.getLastCommentDate() //RFE
0N/A }
0N/A } catch (IOException e) {
99N/A e.printStackTrace();
0N/A }
99N/A doc.add(new Field("date", date, Field.Store.YES, Field.Index.UN_TOKENIZED));
143N/A if (path != null) {
99N/A doc.add(new Field("path", path, Field.Store.YES, Field.Index.TOKENIZED));
123N/A Project project = Project.getProject(path);
123N/A if (project != null) {
123N/A doc.add(new Field("project", project.getPath(), Field.Store.YES, Field.Index.TOKENIZED));
58N/A }
143N/A }
0N/A FileAnalyzer fa = null;
0N/A try {
0N/A fa = getAnalyzer(in, path);
0N/A } catch (Exception e) {
0N/A }
0N/A if (fa != null) {
0N/A try {
143N/A Genre g = fa.getGenre();
0N/A if (g == Genre.PLAIN) {
99N/A doc.add(new Field("t", "p", Field.Store.YES, Field.Index.UN_TOKENIZED));
143N/A } else if (g == Genre.XREFABLE) {
99N/A doc.add(new Field("t", "x", Field.Store.YES, Field.Index.UN_TOKENIZED));
143N/A } else if (g == Genre.HTML) {
99N/A doc.add(new Field("t", "h", Field.Store.YES, Field.Index.UN_TOKENIZED));
0N/A }
0N/A fa.analyze(doc, in);
0N/A } catch (Exception e) {
0N/A // Ignoring any errors while analysing
0N/A }
0N/A }
0N/A doc.removeField("fullpath");
143N/A
0N/A return doc;
0N/A }
143N/A
0N/A /**
143N/A * Get the content type for a named file.
143N/A *
143N/A * @param file The file to get the content type for
0N/A * @return The contentType suitable for printing to response.setContentType()
0N/A */
143N/A public static String getContentType(String file) {
143N/A Class<? extends FileAnalyzer> a = find(file);
0N/A return getContentType(a);
0N/A }
143N/A
143N/A /**
143N/A * Get the content type for a named file.
143N/A *
143N/A * @param in The input stream we want to get the content type for (if
143N/A * we cannot determine the content type by the filename)
143N/A * @param file The name of the file
143N/A * @return The contentType suitable for printing to response.setContentType()
143N/A * @throws java.io.IOException If an error occurs while accessing the input
143N/A * stream.
143N/A */
143N/A public static String getContentType(InputStream in, String file) throws IOException {
143N/A Class<? extends FileAnalyzer> a = find(in, file);
143N/A return getContentType(a);
143N/A }
143N/A
143N/A /**
143N/A * Get the content type the named analyzer accepts
143N/A * @param analyzer the analyzer to test
143N/A * @return the contentType suitable for printing to response.setContentType()
143N/A */
143N/A public static String getContentType(Class<? extends FileAnalyzer> analyzer) {
0N/A String contentType = null;
143N/A if (analyzer != null) {
0N/A try {
143N/A contentType = (String) analyzer.getMethod("getContentType").invoke(null);
143N/A } catch (Exception e) {
0N/A }
0N/A }
0N/A return contentType;
0N/A }
143N/A
143N/A /**
143N/A * Write a browsable version of the file
143N/A *
143N/A * @param analyzer The analyzer for this filetype
143N/A * @param in The input stream containing the data
143N/A * @param out Where to write the result
143N/A * @param annotation Annotation information for the file
143N/A * @throws java.io.IOException If an error occurs while creating the
143N/A * output
143N/A */
143N/A public static void writeXref(Class<? extends FileAnalyzer> analyzer, InputStream in, Writer out, Annotation annotation) throws IOException {
143N/A if (analyzer != null) {
0N/A try {
143N/A analyzer.getMethod("writeXref", InputStream.class, Writer.class, Annotation.class).invoke(null, in, out, annotation);
0N/A } catch (IllegalArgumentException ex) {
0N/A } catch (SecurityException ex) {
0N/A } catch (NoSuchMethodException ex) {
0N/A } catch (InvocationTargetException ex) {
0N/A } catch (IllegalAccessException ex) {
0N/A }
0N/A }
0N/A }
143N/A
0N/A /**
143N/A * Get the genre of a file
143N/A *
143N/A * @param file The file to inpect
0N/A * @return The genre suitable to decide how to display the file
0N/A */
143N/A public static Genre getGenre(String file) {
143N/A Class a = find(file);
0N/A return getGenre(a);
0N/A }
143N/A
143N/A /**
143N/A * Get the genre of a file (or the content of the file)
143N/A *
143N/A * @param in The content of the file
143N/A * @param file The file to inpect
143N/A * @return The genre suitable to decide how to display the file
143N/A * @throws java.io.IOException If an error occurs while getting the content
143N/A * of the file
143N/A */
143N/A public static Genre getGenre(InputStream in, String file) throws IOException {
143N/A Class a = find(in, file);
143N/A return getGenre(a);
143N/A }
143N/A
143N/A /**
143N/A * Get the genre of a bulk of data
143N/A *
143N/A * @param in A stream containing the data
143N/A * @return The genre suitable to decide how to display the file
143N/A * @throws java.io.IOException If an error occurs while getting the content
143N/A */
0N/A public static Genre getGenre(InputStream in) throws IOException {
0N/A Class a = find(in);
0N/A return getGenre(a);
0N/A }
143N/A
143N/A /**
143N/A * Get the genre for a named class (this is most likely an analyzer)
143N/A * @param clazz the class to get the genre for
143N/A * @return The genre of this class (null if not found)
143N/A */
143N/A public static Genre getGenre(Class clazz) {
0N/A Genre g = null;
143N/A if (clazz != null) {
0N/A try {
143N/A g = (Genre) clazz.getField("g").get(null);
143N/A } catch (Exception e) {
0N/A e.printStackTrace();
0N/A }
0N/A }
0N/A return g;
0N/A }
143N/A
0N/A /**
143N/A * Finds a suitable analyser class for file name. If the analyzer cannot
143N/A * be determined by the file extension, try to look at the data in the
143N/A * InputStream to find a suitable analyzer.
143N/A *
0N/A * Use if you just want to find file type.
143N/A *
143N/A *
143N/A * @param in The input stream containing the data
143N/A * @param file The file name to get the analyzer for
143N/A * @return The analyzer to use
143N/A * @throws java.io.IOException If a problem occurs while reading the data
0N/A */
143N/A public static Class<? extends FileAnalyzer> find(InputStream in, String file) throws IOException {
143N/A Class<? extends FileAnalyzer> a = find(file);
143N/A if (a == null) {
0N/A a = find(in);
0N/A }
0N/A return a;
0N/A }
143N/A
143N/A /**
143N/A * Finds a suitable analyser class for file name.
143N/A *
143N/A * @param file The file name to get the analyzer for
143N/A * @return The analyzer to use
143N/A */
143N/A public static Class<? extends FileAnalyzer> find(String file) {
0N/A int i = 0;
143N/A if ((i = file.lastIndexOf('/')) > 0 || (i = file.lastIndexOf('\\')) > 0) {
143N/A if (i + 1 < file.length()) {
143N/A file = file.substring(i + 1);
143N/A }
0N/A }
143N/A file = file.toUpperCase();
143N/A int dotpos = file.lastIndexOf('.');
143N/A if (dotpos >= 0) {
143N/A Class<? extends FileAnalyzer> analyzer = ext.get(file.substring(dotpos + 1).toUpperCase());
0N/A if (analyzer != null) {
0N/A //System.err.println(path.substring(dotpos+1).toUpperCase() + " = " + analyzer.getSimpleName());
0N/A return analyzer;
0N/A }
0N/A }
150N/A // file doesn't have any of the extensions we know
150N/A return null;
0N/A }
143N/A
143N/A /**
143N/A * Finds a suitable analyser class for the data in this stream
143N/A *
143N/A * @param in The stream containing the data to analyze
143N/A * @return The analyzer to use
143N/A * @throws java.io.IOException if an error occurs while reading data from
143N/A * the stream
143N/A */
143N/A public static Class<? extends FileAnalyzer> find(InputStream in) throws IOException {
0N/A in.mark(8);
0N/A byte[] content = new byte[8];
0N/A int len = in.read(content);
0N/A in.reset();
143N/A if (len < 4) {
0N/A return null;
143N/A }
36N/A Class<? extends FileAnalyzer> a = find(content);
143N/A if (a == null) {
143N/A for (Method matcher : matchers) {
0N/A try {
0N/A //System.out.println("USING = " + matcher.getName());
36N/A // cannot check conversion because of reflection
143N/A @SuppressWarnings(value = "unchecked")
143N/A Class<? extends FileAnalyzer> c = (Class) matcher.invoke(null, content);
143N/A
36N/A if (c != null) {
36N/A return c;
0N/A }
143N/A } catch (Exception e) {
32N/A e.printStackTrace();
0N/A }
0N/A }
0N/A }
0N/A return a;
0N/A }
143N/A
143N/A /**
143N/A * Finds a suitable analyser class for a magic signature
143N/A *
143N/A * @param signature the magic signature look up
143N/A * @return The analyzer to use
143N/A */
143N/A public static Class<? extends FileAnalyzer> find(byte[] signature) {
143N/A char[] chars = new char[signature.length > 8 ? 8 : signature.length];
143N/A for (int i = 0; i < chars.length; i++) {
143N/A chars[i] = (char) (0xFF & signature[i]);
0N/A }
143N/A return findMagic(new String(chars));
0N/A }
143N/A
143N/A /**
143N/A * Get an analyzer by looking up the "magic signature"
143N/A * @param signature the signature to look up
143N/A * @return The analyzer to handle data with this signature
143N/A */
143N/A public static Class<? extends FileAnalyzer> findMagic(String signature) {
143N/A Class<? extends FileAnalyzer> a = magics.get(signature);
0N/A if (a == null) {
200N/A String sigWithoutBOM = stripBOM(signature);
200N/A for (Map.Entry<String, Class<? extends FileAnalyzer>> entry :
200N/A magics.entrySet()) {
200N/A if (signature.startsWith(entry.getKey())) {
200N/A return entry.getValue();
200N/A }
200N/A // See if text files have the magic sequence if we remove the
200N/A // byte-order marker
200N/A if (sigWithoutBOM != null &&
200N/A getGenre(entry.getValue()) == Genre.PLAIN &&
200N/A sigWithoutBOM.startsWith(entry.getKey())) {
200N/A return entry.getValue();
0N/A }
0N/A }
0N/A }
0N/A return a;
0N/A }
143N/A
200N/A /** Byte-order markers. */
200N/A private static final String[] BOMS = {
200N/A new String(new char[] { 0xEF, 0xBB, 0xBF }), // UTF-8 BOM
200N/A new String(new char[] { 0xFE, 0xFF }), // UTF-16BE BOM
200N/A new String(new char[] { 0xFF, 0xFE }), // UTF-16LE BOM
200N/A };
200N/A
200N/A /**
200N/A * Strip away the byte-order marker from the string, if it has one.
200N/A *
200N/A * @param str the string to remove the BOM from
200N/A * @return a string without the byte-order marker, or <code>null</code> if
200N/A * the string doesn't start with a BOM
200N/A */
200N/A private static String stripBOM(String str) {
200N/A for (String bom : BOMS) {
200N/A if (str.startsWith(bom)) {
200N/A return str.substring(bom.length());
200N/A }
200N/A }
200N/A return null;
200N/A }
200N/A
143N/A public static void main(String[] args) throws Exception {
0N/A AnalyzerGuru af = new AnalyzerGuru();
0N/A System.out.println("<pre wrap=true>");
143N/A for (String arg : args) {
0N/A try {
143N/A Class<? extends FileAnalyzer> an = AnalyzerGuru.find(arg);
0N/A File f = new File(arg);
0N/A BufferedInputStream in = new BufferedInputStream(new FileInputStream(f));
143N/A FileAnalyzer fa = AnalyzerGuru.getAnalyzer(in, arg);
0N/A System.out.println("\nANALYZER = " + fa);
0N/A Document doc = af.getDocument(f, in, arg);
0N/A System.out.println("\nDOCUMENT = " + doc);
143N/A
99N/A Iterator iterator = doc.getFields().iterator();
99N/A while (iterator.hasNext()) {
99N/A org.apache.lucene.document.Field field = (org.apache.lucene.document.Field) iterator.next();
143N/A if (field.isTokenized()) {
0N/A Reader r = field.readerValue();
143N/A if (r == null) {
0N/A r = new StringReader(field.stringValue());
0N/A }
0N/A TokenStream ts = fa.tokenStream(field.name(), r);
143N/A System.out.println("\nFIELD = " + field.name() + " TOKEN STREAM = " + ts.getClass().getName());
0N/A Token t;
143N/A while ((t = ts.next()) != null) {
0N/A System.out.print(t.termText());
0N/A System.out.print(' ');
0N/A }
0N/A System.out.println();
0N/A }
143N/A if (field.isStored()) {
0N/A System.out.println("\nFIELD = " + field.name());
143N/A if (field.readerValue() == null) {
0N/A System.out.println(field.stringValue());
0N/A } else {
0N/A System.out.println("STORING THE READER");
0N/A }
0N/A }
0N/A }
0N/A System.out.println("Writing XREF--------------");
0N/A Writer out = new OutputStreamWriter(System.out);
0N/A fa.writeXref(out);
0N/A out.flush();
0N/A } catch (Exception e) {
0N/A System.err.println("ERROR: " + e.getMessage());
0N/A e.printStackTrace();
0N/A }
0N/A }
0N/A }
148N/A}