AnalyzerGuru.java revision 424
58N/A/*
58N/A * CDDL HEADER START
58N/A *
58N/A * The contents of this file are subject to the terms of the
58N/A * Common Development and Distribution License (the "License").
58N/A * You may not use this file except in compliance with the License.
58N/A *
58N/A * See LICENSE.txt included in this distribution for the specific
58N/A * language governing permissions and limitations under the License.
58N/A *
58N/A * When distributing Covered Code, include this CDDL HEADER in each
58N/A * file and include the License file at LICENSE.txt.
58N/A * If applicable, add the following below this CDDL HEADER, with the
58N/A * fields enclosed by brackets "[]" replaced with your own identifying
58N/A * information: Portions Copyright [yyyy] [name of copyright owner]
58N/A *
58N/A * CDDL HEADER END
58N/A */
58N/A
58N/A/*
1252N/A * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
58N/A * Use is subject to license terms.
58N/A */
58N/Apackage org.opensolaris.opengrok.analysis;
234N/A
234N/Aimport java.io.BufferedInputStream;
234N/Aimport java.io.File;
234N/Aimport java.io.FileInputStream;
639N/Aimport java.io.IOException;
639N/Aimport java.io.InputStream;
234N/Aimport java.io.OutputStreamWriter;
234N/Aimport java.io.Reader;
234N/Aimport java.io.StringReader;
234N/Aimport java.io.Writer;
639N/Aimport java.util.ArrayList;
639N/Aimport java.util.HashMap;
58N/Aimport java.util.Iterator;
1185N/Aimport java.util.List;
667N/Aimport java.util.Locale;
1185N/Aimport java.util.Map;
1016N/Aimport java.util.SortedMap;
58N/Aimport java.util.TreeMap;
1185N/Aimport java.util.logging.Level;
1016N/Aimport org.apache.lucene.analysis.Token;
1185N/Aimport org.apache.lucene.analysis.TokenStream;
664N/Aimport org.apache.lucene.document.DateTools;
1026N/Aimport org.apache.lucene.document.Document;
112N/Aimport org.apache.lucene.document.Field;
1195N/Aimport org.opensolaris.opengrok.OpenGrokLogger;
58N/Aimport org.opensolaris.opengrok.analysis.FileAnalyzer.Genre;
58N/Aimport org.opensolaris.opengrok.analysis.archive.BZip2AnalyzerFactory;
77N/Aimport org.opensolaris.opengrok.analysis.archive.GZIPAnalyzerFactory;
77N/Aimport org.opensolaris.opengrok.analysis.archive.TarAnalyzerFactory;
77N/Aimport org.opensolaris.opengrok.analysis.archive.ZipAnalyzerFactory;
77N/Aimport org.opensolaris.opengrok.analysis.c.CAnalyzerFactory;
58N/Aimport org.opensolaris.opengrok.analysis.data.IgnorantAnalyzerFactory;
418N/Aimport org.opensolaris.opengrok.analysis.data.ImageAnalyzerFactory;
58N/Aimport org.opensolaris.opengrok.analysis.document.TroffAnalyzerFactory;
773N/Aimport org.opensolaris.opengrok.analysis.executables.ELFAnalyzerFactory;
773N/Aimport org.opensolaris.opengrok.analysis.executables.JarAnalyzerFactory;
58N/Aimport org.opensolaris.opengrok.analysis.executables.JavaClassAnalyzerFactory;
773N/Aimport org.opensolaris.opengrok.analysis.java.JavaAnalyzerFactory;
773N/Aimport org.opensolaris.opengrok.analysis.lisp.LispAnalyzerFactory;
773N/Aimport org.opensolaris.opengrok.analysis.plain.PlainAnalyzerFactory;
773N/Aimport org.opensolaris.opengrok.analysis.plain.XMLAnalyzerFactory;
58N/Aimport org.opensolaris.opengrok.analysis.sh.ShAnalyzerFactory;
773N/Aimport org.opensolaris.opengrok.analysis.sql.SQLAnalyzerFactory;
773N/Aimport org.opensolaris.opengrok.analysis.tcl.TclAnalyzerFactory;
773N/Aimport org.opensolaris.opengrok.configuration.Project;
773N/Aimport org.opensolaris.opengrok.history.Annotation;
58N/Aimport org.opensolaris.opengrok.history.HistoryGuru;
58N/Aimport org.opensolaris.opengrok.history.HistoryReader;
58N/Aimport org.opensolaris.opengrok.web.Util;
664N/A
58N/A/**
65N/A * Manages and provides Analyzers as needed. Please see
894N/A * <a href="http://www.opensolaris.org/os/project/opengrok/manual/internals/">
77N/A * this</a> page for a great description of the purpose of the AnalyzerGuru.
99N/A *
99N/A * Created on September 22, 2005
1115N/A * @author Chandan
1115N/A */
125N/Apublic class AnalyzerGuru {
112N/A
1026N/A /** The default {@code FileAnalyzerFactory} instance. */
129N/A private static final FileAnalyzerFactory
1100N/A DEFAULT_ANALYZER_FACTORY = new FileAnalyzerFactory();
129N/A
129N/A /** Map from file extensions to analyzer factories. */
318N/A private static final Map<String, FileAnalyzerFactory>
318N/A ext = new HashMap<String, FileAnalyzerFactory>();
144N/A
173N/A // TODO: have a comparator
253N/A /** Map from magic strings to analyzer factories. */
296N/A private static final SortedMap<String, FileAnalyzerFactory>
335N/A magics = new TreeMap<String, FileAnalyzerFactory>();
480N/A
816N/A /**
816N/A * List of matcher objects which can be used to determine which analyzer
833N/A * factory to use.
833N/A */
1185N/A private static final List<FileAnalyzerFactory.Matcher>
1016N/A matchers = new ArrayList<FileAnalyzerFactory.Matcher>();
1123N/A
1125N/A /** List of all registered {@code FileAnalyzerFactory} instances. */
1218N/A private static final List<FileAnalyzerFactory>
1185N/A factories = new ArrayList<FileAnalyzerFactory>();
993N/A
1185N/A /*
1185N/A * If you write your own analyzer please register it here
1190N/A */
1185N/A static {
1185N/A FileAnalyzerFactory[] analyzers = {
1252N/A DEFAULT_ANALYZER_FACTORY,
1185N/A new IgnorantAnalyzerFactory(),
1185N/A new BZip2AnalyzerFactory(),
1185N/A new XMLAnalyzerFactory(),
1185N/A new TroffAnalyzerFactory(),
1185N/A new ELFAnalyzerFactory(),
1185N/A new JavaClassAnalyzerFactory(),
1185N/A new ImageAnalyzerFactory(),
1185N/A JarAnalyzerFactory.DEFAULT_INSTANCE,
1185N/A ZipAnalyzerFactory.DEFAULT_INSTANCE,
1185N/A new TarAnalyzerFactory(),
1252N/A new CAnalyzerFactory(),
1185N/A new ShAnalyzerFactory(),
1185N/A PlainAnalyzerFactory.DEFAULT_INSTANCE,
1185N/A new GZIPAnalyzerFactory(),
1185N/A new JavaAnalyzerFactory(),
1185N/A new LispAnalyzerFactory(),
1185N/A new TclAnalyzerFactory(),
993N/A new SQLAnalyzerFactory(),
993N/A };
993N/A
1185N/A for (FileAnalyzerFactory analyzer : analyzers) {
993N/A registerAnalyzer(analyzer);
993N/A }
937N/A }
58N/A
58N/A /**
816N/A * Register a {@code FileAnalyzerFactory} instance.
58N/A */
58N/A private static void registerAnalyzer(FileAnalyzerFactory factory) {
773N/A for (String suffix : factory.getSuffixes()) {
58N/A FileAnalyzerFactory old = ext.put(suffix, factory);
664N/A assert old == null :
58N/A "suffix '" + suffix + "' used in multiple analyzers";
850N/A }
1118N/A for (String magic : factory.getMagicStrings()) {
870N/A FileAnalyzerFactory old = magics.put(magic, factory);
870N/A assert old == null :
99N/A "magic '" + magic + "' used in multiple analyzers";
1115N/A }
101N/A matchers.addAll(factory.getMatchers());
106N/A factories.add(factory);
112N/A }
1026N/A
129N/A /**
129N/A * Instruct the AnalyzerGuru to use a given analyzer for a given
129N/A * file extension.
875N/A * @param extension the file-extension to add
318N/A * @param factory a factory which creates
144N/A * the analyzer to use for the given extension
173N/A * (if you pass null as the analyzer, you will disable
253N/A * the analyzer used for that extension)
296N/A */
335N/A public static void addExtension(String extension,
480N/A FileAnalyzerFactory factory) {
816N/A if (factory == null) {
816N/A ext.remove(extension);
993N/A } else {
1016N/A ext.put(extension, factory);
1185N/A }
1185N/A }
58N/A
937N/A /**
1185N/A * Get the default Analyzer.
1185N/A */
1185N/A public static FileAnalyzer getAnalyzer() {
1190N/A return DEFAULT_ANALYZER_FACTORY.getAnalyzer();
1185N/A }
1185N/A
1185N/A /**
1185N/A * Get an analyzer suited to analyze a file. This function will reuse
1185N/A * analyzers since they are costly.
1185N/A *
1185N/A * @param in Input stream containing data to be analyzed
1185N/A * @param file Name of the file to be analyzed
1185N/A * @return An analyzer suited for that file content
1185N/A * @throws java.io.IOException If an error occurs while accessing the
1185N/A * data in the input stream.
1185N/A */
1185N/A public static FileAnalyzer getAnalyzer(InputStream in, String file) throws IOException {
1185N/A FileAnalyzerFactory factory = find(in, file);
1190N/A if (factory == null) {
1185N/A return getAnalyzer();
1185N/A }
1185N/A return factory.getAnalyzer();
1185N/A }
1190N/A
58N/A /**
58N/A * Create a Lucene document and fill in the required fields
58N/A * @param file The file to index
937N/A * @param in The data to generate the index for
58N/A * @param path Where the file is located (from source root)
58N/A * @return The Lucene document to add to the index database
58N/A * @throws java.io.IOException If an exception occurs while collecting the
937N/A * datas
816N/A */
816N/A public Document getDocument(File file, InputStream in, String path,
816N/A FileAnalyzer fa) throws IOException {
816N/A Document doc = new Document();
816N/A String date = DateTools.timeToString(file.lastModified(), DateTools.Resolution.MILLISECOND);
816N/A doc.add(new Field("u", Util.uid(path, date), Field.Store.YES, Field.Index.UN_TOKENIZED));
816N/A doc.add(new Field("fullpath", file.getAbsolutePath(), Field.Store.YES, Field.Index.TOKENIZED));
816N/A
816N/A try {
816N/A HistoryReader hr = HistoryGuru.getInstance().getHistoryReader(file);
816N/A if (hr != null) {
816N/A doc.add(new Field("hist", hr));
816N/A // date = hr.getLastCommentDate() //RFE
816N/A }
816N/A } catch (IOException e) {
773N/A OpenGrokLogger.getLogger().log(Level.WARNING, "An error occurred while reading history: ", e);
773N/A }
773N/A doc.add(new Field("date", date, Field.Store.YES, Field.Index.UN_TOKENIZED));
773N/A if (path != null) {
773N/A doc.add(new Field("path", path, Field.Store.YES, Field.Index.TOKENIZED));
773N/A Project project = Project.getProject(path);
58N/A if (project != null) {
58N/A doc.add(new Field("project", project.getPath(), Field.Store.YES, Field.Index.TOKENIZED));
58N/A }
773N/A }
773N/A
773N/A if (fa != null) {
773N/A try {
773N/A Genre g = fa.getGenre();
58N/A if (g == Genre.PLAIN) {
58N/A doc.add(new Field("t", "p", Field.Store.YES, Field.Index.UN_TOKENIZED));
58N/A } else if (g == Genre.XREFABLE) {
773N/A doc.add(new Field("t", "x", Field.Store.YES, Field.Index.UN_TOKENIZED));
773N/A } else if (g == Genre.HTML) {
773N/A doc.add(new Field("t", "h", Field.Store.YES, Field.Index.UN_TOKENIZED));
773N/A }
773N/A fa.analyze(doc, in);
773N/A } catch (Exception e) {
773N/A // Ignoring any errors while analysing
773N/A }
773N/A }
58N/A doc.removeField("fullpath");
58N/A
58N/A return doc;
773N/A }
773N/A
773N/A /**
773N/A * Get the content type for a named file.
773N/A *
773N/A * @param in The input stream we want to get the content type for (if
773N/A * we cannot determine the content type by the filename)
58N/A * @param file The name of the file
58N/A * @return The contentType suitable for printing to response.setContentType() or null
58N/A * if the factory was not found
773N/A * @throws java.io.IOException If an error occurs while accessing the input
773N/A * stream.
773N/A */
773N/A public static String getContentType(InputStream in, String file) throws IOException {
773N/A FileAnalyzerFactory factory = find(in, file);
773N/A String type = null;
773N/A if (factory != null) {
773N/A type = factory.getContentType();
773N/A }
773N/A return type;
773N/A }
773N/A
773N/A /**
773N/A * Write a browsable version of the file
773N/A *
773N/A * @param factory The analyzer factory for this filetype
773N/A * @param in The input stream containing the data
773N/A * @param out Where to write the result
773N/A * @param annotation Annotation information for the file
773N/A * @param project Project the file belongs to
773N/A * @throws java.io.IOException If an error occurs while creating the
773N/A * output
773N/A */
937N/A public static void writeXref(FileAnalyzerFactory factory, InputStream in,
58N/A Writer out, Annotation annotation, Project project)
58N/A throws IOException
58N/A {
937N/A factory.writeXref(in, out, annotation, project);
58N/A }
58N/A
58N/A /**
937N/A * Get the genre of a file
58N/A *
58N/A * @param file The file to inpect
58N/A * @return The genre suitable to decide how to display the file
937N/A */
58N/A public static Genre getGenre(String file) {
58N/A return getGenre(find(file));
58N/A }
937N/A
58N/A /**
58N/A * Get the genre of a bulk of data
58N/A *
937N/A * @param in A stream containing the data
58N/A * @return The genre suitable to decide how to display the file
58N/A * @throws java.io.IOException If an error occurs while getting the content
58N/A */
937N/A public static Genre getGenre(InputStream in) throws IOException {
664N/A return getGenre(find(in));
58N/A }
58N/A
937N/A /**
664N/A * Get the genre for a named class (this is most likely an analyzer)
58N/A * @param factory the analyzer factory to get the genre for
58N/A * @return The genre of this class (null if not found)
937N/A */
58N/A public static Genre getGenre(FileAnalyzerFactory factory) {
58N/A if (factory != null) {
58N/A return factory.getGenre();
937N/A }
1185N/A return null;
1252N/A }
1252N/A
1252N/A /**
1252N/A * Find a {@code FileAnalyzerFactory} with the specified class name. If one
1185N/A * doesn't exist, create one and register it.
1185N/A *
58N/A * @param factoryClassName name of the factory class
58N/A * @return a file analyzer factory
58N/A *
937N/A * @throws ClassNotFoundException if there is no class with that name
65N/A * @throws ClassCastException if the class is not a subclass of {@code
65N/A * FileAnalyzerFactory}
65N/A * @throws IllegalAccessException if the constructor cannot be accessed
937N/A * @throws InstantiationException if the class cannot be instantiated
65N/A */
65N/A public static FileAnalyzerFactory findFactory(String factoryClassName)
65N/A throws ClassNotFoundException, IllegalAccessException,
937N/A InstantiationException
77N/A {
77N/A return findFactory(Class.forName(factoryClassName));
77N/A }
937N/A
77N/A /**
77N/A * Find a {@code FileAnalyzerFactory} which is an instance of the specified
77N/A * class. If one doesn't exist, create one and register it.
99N/A *
99N/A * @param factoryClass the factory class
99N/A * @return a file analyzer factory
99N/A *
99N/A * @throws ClassCastException if the class is not a subclass of {@code
99N/A * FileAnalyzerFactory}
99N/A * @throws IllegalAccessException if the constructor cannot be accessed
99N/A * @throws InstantiationException if the class cannot be instantiated
99N/A */
99N/A private static FileAnalyzerFactory findFactory(Class factoryClass)
99N/A throws InstantiationException, IllegalAccessException
99N/A {
99N/A for (FileAnalyzerFactory f : factories) {
99N/A if (f.getClass() == factoryClass) {
99N/A return f;
99N/A }
937N/A }
1115N/A FileAnalyzerFactory f =
1115N/A (FileAnalyzerFactory) factoryClass.newInstance();
1115N/A registerAnalyzer(f);
1115N/A return f;
1115N/A }
1115N/A
1115N/A /**
1115N/A * Finds a suitable analyser class for file name. If the analyzer cannot
125N/A * be determined by the file extension, try to look at the data in the
125N/A * InputStream to find a suitable analyzer.
125N/A *
937N/A * Use if you just want to find file type.
125N/A *
125N/A *
125N/A * @param in The input stream containing the data
106N/A * @param file The file name to get the analyzer for
106N/A * @return the analyzer factory to use
937N/A * @throws java.io.IOException If a problem occurs while reading the data
106N/A */
106N/A public static FileAnalyzerFactory find(InputStream in, String file)
106N/A throws IOException
937N/A {
106N/A FileAnalyzerFactory factory = find(file);
106N/A if (factory != null) {
106N/A return factory;
112N/A }
112N/A return find(in);
112N/A }
112N/A
112N/A /**
112N/A * Finds a suitable analyser class for file name.
112N/A *
112N/A * @param file The file name to get the analyzer for
129N/A * @return the analyzer factory to use
1026N/A */
1026N/A public static FileAnalyzerFactory find(String file) {
1026N/A int i = 0;
1026N/A if ((i = file.lastIndexOf('/')) > 0 || (i = file.lastIndexOf('\\')) > 0) {
1026N/A if (i + 1 < file.length()) {
1026N/A file = file.substring(i + 1);
1026N/A }
1026N/A }
129N/A file = file.toUpperCase(Locale.US);
129N/A int dotpos = file.lastIndexOf('.');
129N/A if (dotpos >= 0) {
129N/A FileAnalyzerFactory factory =
129N/A ext.get(file.substring(dotpos + 1).toUpperCase());
129N/A if (factory != null) {
129N/A return factory;
129N/A }
1100N/A }
1100N/A // file doesn't have any of the extensions we know
1100N/A return null;
1100N/A }
1100N/A
1100N/A /**
1100N/A * Finds a suitable analyser class for the data in this stream
1100N/A *
129N/A * @param in The stream containing the data to analyze
129N/A * @return the analyzer factory to use
129N/A * @throws java.io.IOException if an error occurs while reading data from
129N/A * the stream
129N/A */
129N/A public static FileAnalyzerFactory find(InputStream in) throws IOException {
129N/A in.mark(8);
129N/A byte[] content = new byte[8];
129N/A int len = in.read(content);
129N/A in.reset();
129N/A if (len < 4) {
129N/A return null;
129N/A }
129N/A
129N/A FileAnalyzerFactory factory = find(content);
937N/A if (factory != null) {
318N/A return factory;
318N/A }
318N/A
318N/A for (FileAnalyzerFactory.Matcher matcher : matchers) {
318N/A FileAnalyzerFactory fac = matcher.isMagic(content, in);
318N/A if (fac != null) {
318N/A return fac;
318N/A }
318N/A }
318N/A
318N/A return null;
318N/A }
318N/A
318N/A /**
318N/A * Finds a suitable analyser class for a magic signature
144N/A *
144N/A * @param signature the magic signature look up
144N/A * @return the analyzer factory to use
144N/A */
144N/A public static FileAnalyzerFactory find(byte[] signature) {
144N/A char[] chars = new char[signature.length > 8 ? 8 : signature.length];
144N/A for (int i = 0; i < chars.length; i++) {
144N/A chars[i] = (char) (0xFF & signature[i]);
173N/A }
173N/A return findMagic(new String(chars));
173N/A }
173N/A
173N/A /**
173N/A * Get an analyzer by looking up the "magic signature"
173N/A * @param signature the signature to look up
173N/A * @return the analyzer factory to handle data with this signature
234N/A */
253N/A public static FileAnalyzerFactory findMagic(String signature) {
253N/A FileAnalyzerFactory a = magics.get(signature);
253N/A if (a == null) {
253N/A String sigWithoutBOM = stripBOM(signature);
253N/A for (Map.Entry<String, FileAnalyzerFactory> entry :
253N/A magics.entrySet()) {
253N/A if (signature.startsWith(entry.getKey())) {
253N/A return entry.getValue();
296N/A }
296N/A // See if text files have the magic sequence if we remove the
296N/A // byte-order marker
296N/A if (sigWithoutBOM != null &&
296N/A entry.getValue().getGenre() == Genre.PLAIN &&
296N/A sigWithoutBOM.startsWith(entry.getKey())) {
296N/A return entry.getValue();
335N/A }
335N/A }
335N/A }
335N/A return a;
335N/A }
335N/A
335N/A /** Byte-order markers. */
335N/A private static final String[] BOMS = {
335N/A new String(new char[] {0xEF, 0xBB, 0xBF}), // UTF-8 BOM
480N/A new String(new char[] {0xFE, 0xFF}), // UTF-16BE BOM
480N/A new String(new char[] {0xFF, 0xFE}), // UTF-16LE BOM
480N/A };
480N/A
480N/A /**
480N/A * Strip away the byte-order marker from the string, if it has one.
480N/A *
937N/A * @param str the string to remove the BOM from
937N/A * @return a string without the byte-order marker, or <code>null</code> if
667N/A * the string doesn't start with a BOM
667N/A */
667N/A private static String stripBOM(String str) {
833N/A for (String bom : BOMS) {
833N/A if (str.startsWith(bom)) {
833N/A return str.substring(bom.length());
833N/A }
833N/A }
833N/A return null;
833N/A }
833N/A}
833N/A