AnalyzerGuru.java revision 1153
0N/A/*
0N/A * CDDL HEADER START
0N/A *
0N/A * The contents of this file are subject to the terms of the
0N/A * Common Development and Distribution License (the "License").
0N/A * You may not use this file except in compliance with the License.
0N/A *
0N/A * See LICENSE.txt included in this distribution for the specific
0N/A * language governing permissions and limitations under the License.
0N/A *
0N/A * When distributing Covered Code, include this CDDL HEADER in each
0N/A * file and include the License file at LICENSE.txt.
0N/A * If applicable, add the following below this CDDL HEADER, with the
0N/A * fields enclosed by brackets "[]" replaced with your own identifying
0N/A * information: Portions Copyright [yyyy] [name of copyright owner]
0N/A *
0N/A * CDDL HEADER END
0N/A */
0N/A
0N/A/*
1072N/A * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
0N/A */
0N/Apackage org.opensolaris.opengrok.analysis;
0N/A
143N/Aimport java.io.File;
143N/Aimport java.io.IOException;
143N/Aimport java.io.InputStream;
921N/Aimport java.io.Reader;
143N/Aimport java.io.Writer;
143N/Aimport java.util.ArrayList;
143N/Aimport java.util.HashMap;
202N/Aimport java.util.List;
421N/Aimport java.util.Locale;
200N/Aimport java.util.Map;
143N/Aimport java.util.SortedMap;
143N/Aimport java.util.TreeMap;
424N/Aimport java.util.logging.Level;
143N/Aimport org.apache.lucene.document.DateTools;
143N/Aimport org.apache.lucene.document.Document;
99N/Aimport org.apache.lucene.document.Field;
424N/Aimport org.opensolaris.opengrok.OpenGrokLogger;
0N/Aimport org.opensolaris.opengrok.analysis.FileAnalyzer.Genre;
202N/Aimport org.opensolaris.opengrok.analysis.archive.BZip2AnalyzerFactory;
202N/Aimport org.opensolaris.opengrok.analysis.archive.GZIPAnalyzerFactory;
202N/Aimport org.opensolaris.opengrok.analysis.archive.TarAnalyzerFactory;
202N/Aimport org.opensolaris.opengrok.analysis.archive.ZipAnalyzerFactory;
202N/Aimport org.opensolaris.opengrok.analysis.c.CAnalyzerFactory;
670N/Aimport org.opensolaris.opengrok.analysis.c.CxxAnalyzerFactory;
202N/Aimport org.opensolaris.opengrok.analysis.data.IgnorantAnalyzerFactory;
202N/Aimport org.opensolaris.opengrok.analysis.data.ImageAnalyzerFactory;
202N/Aimport org.opensolaris.opengrok.analysis.document.TroffAnalyzerFactory;
202N/Aimport org.opensolaris.opengrok.analysis.executables.ELFAnalyzerFactory;
202N/Aimport org.opensolaris.opengrok.analysis.executables.JarAnalyzerFactory;
202N/Aimport org.opensolaris.opengrok.analysis.executables.JavaClassAnalyzerFactory;
825N/Aimport org.opensolaris.opengrok.analysis.fortran.FortranAnalyzerFactory;
202N/Aimport org.opensolaris.opengrok.analysis.java.JavaAnalyzerFactory;
202N/Aimport org.opensolaris.opengrok.analysis.lisp.LispAnalyzerFactory;
1073N/Aimport org.opensolaris.opengrok.analysis.perl.PerlAnalyzerFactory;
1153N/Aimport org.opensolaris.opengrok.analysis.php.PhpAnalyzerFactory;
202N/Aimport org.opensolaris.opengrok.analysis.plain.PlainAnalyzerFactory;
202N/Aimport org.opensolaris.opengrok.analysis.plain.XMLAnalyzerFactory;
1028N/Aimport org.opensolaris.opengrok.analysis.python.PythonAnalyzerFactory;
202N/Aimport org.opensolaris.opengrok.analysis.sh.ShAnalyzerFactory;
202N/Aimport org.opensolaris.opengrok.analysis.sql.SQLAnalyzerFactory;
244N/Aimport org.opensolaris.opengrok.analysis.tcl.TclAnalyzerFactory;
58N/Aimport org.opensolaris.opengrok.configuration.Project;
143N/Aimport org.opensolaris.opengrok.history.Annotation;
615N/Aimport org.opensolaris.opengrok.history.HistoryException;
143N/Aimport org.opensolaris.opengrok.history.HistoryGuru;
143N/Aimport org.opensolaris.opengrok.history.HistoryReader;
0N/Aimport org.opensolaris.opengrok.web.Util;
0N/A
/**
 * Manages and provides Analyzers as needed. Please see
 * <a href="http://www.opensolaris.org/os/project/opengrok/manual/internals/">
 * this</a> page for a great description of the purpose of the AnalyzerGuru.
 *
 * Created on September 22, 2005
 * @author Chandan
 */
0N/Apublic class AnalyzerGuru {
143N/A
202N/A /** The default {@code FileAnalyzerFactory} instance. */
202N/A private static final FileAnalyzerFactory
202N/A DEFAULT_ANALYZER_FACTORY = new FileAnalyzerFactory();
202N/A
483N/A /** Map from file names to analyzer factories. */
483N/A private static final Map<String, FileAnalyzerFactory>
483N/A FILE_NAMES = new HashMap<String, FileAnalyzerFactory>();
483N/A
202N/A /** Map from file extensions to analyzer factories. */
202N/A private static final Map<String, FileAnalyzerFactory>
202N/A ext = new HashMap<String, FileAnalyzerFactory>();
202N/A
460N/A // @TODO: have a comparator
202N/A /** Map from magic strings to analyzer factories. */
202N/A private static final SortedMap<String, FileAnalyzerFactory>
202N/A magics = new TreeMap<String, FileAnalyzerFactory>();
202N/A
202N/A /**
202N/A * List of matcher objects which can be used to determine which analyzer
202N/A * factory to use.
202N/A */
202N/A private static final List<FileAnalyzerFactory.Matcher>
202N/A matchers = new ArrayList<FileAnalyzerFactory.Matcher>();
202N/A
210N/A /** List of all registered {@code FileAnalyzerFactory} instances. */
210N/A private static final List<FileAnalyzerFactory>
210N/A factories = new ArrayList<FileAnalyzerFactory>();
210N/A
0N/A /*
0N/A * If you write your own analyzer please register it here
0N/A */
36N/A static {
202N/A FileAnalyzerFactory[] analyzers = {
202N/A DEFAULT_ANALYZER_FACTORY,
202N/A new IgnorantAnalyzerFactory(),
202N/A new BZip2AnalyzerFactory(),
202N/A new XMLAnalyzerFactory(),
202N/A new TroffAnalyzerFactory(),
202N/A new ELFAnalyzerFactory(),
202N/A new JavaClassAnalyzerFactory(),
202N/A new ImageAnalyzerFactory(),
257N/A JarAnalyzerFactory.DEFAULT_INSTANCE,
257N/A ZipAnalyzerFactory.DEFAULT_INSTANCE,
202N/A new TarAnalyzerFactory(),
202N/A new CAnalyzerFactory(),
670N/A new CxxAnalyzerFactory(),
202N/A new ShAnalyzerFactory(),
257N/A PlainAnalyzerFactory.DEFAULT_INSTANCE,
202N/A new GZIPAnalyzerFactory(),
202N/A new JavaAnalyzerFactory(),
1028N/A new PythonAnalyzerFactory(),
1072N/A new PerlAnalyzerFactory(),
1153N/A new PhpAnalyzerFactory(),
202N/A new LispAnalyzerFactory(),
244N/A new TclAnalyzerFactory(),
202N/A new SQLAnalyzerFactory(),
825N/A new FortranAnalyzerFactory()
202N/A };
202N/A
202N/A for (FileAnalyzerFactory analyzer : analyzers) {
210N/A registerAnalyzer(analyzer);
0N/A }
0N/A }
126N/A
143N/A /**
210N/A * Register a {@code FileAnalyzerFactory} instance.
210N/A */
210N/A private static void registerAnalyzer(FileAnalyzerFactory factory) {
483N/A for (String name : factory.getFileNames()) {
483N/A FileAnalyzerFactory old = FILE_NAMES.put(name, factory);
483N/A assert old == null :
483N/A "name '" + name + "' used in multiple analyzers";
483N/A }
210N/A for (String suffix : factory.getSuffixes()) {
210N/A FileAnalyzerFactory old = ext.put(suffix, factory);
210N/A assert old == null :
210N/A "suffix '" + suffix + "' used in multiple analyzers";
210N/A }
210N/A for (String magic : factory.getMagicStrings()) {
257N/A FileAnalyzerFactory old = magics.put(magic, factory);
257N/A assert old == null :
257N/A "magic '" + magic + "' used in multiple analyzers";
210N/A }
210N/A matchers.addAll(factory.getMatchers());
210N/A factories.add(factory);
210N/A }
210N/A
210N/A /**
143N/A * Instruct the AnalyzerGuru to use a given analyzer for a given
143N/A * file extension.
143N/A * @param extension the file-extension to add
202N/A * @param factory a factory which creates
202N/A * the analyzer to use for the given extension
143N/A * (if you pass null as the analyzer, you will disable
143N/A * the analyzer used for that extension)
143N/A */
202N/A public static void addExtension(String extension,
202N/A FileAnalyzerFactory factory) {
202N/A if (factory == null) {
202N/A ext.remove(extension);
202N/A } else {
202N/A ext.put(extension, factory);
126N/A }
126N/A }
143N/A
202N/A /**
0N/A * Get the default Analyzer.
0N/A */
0N/A public static FileAnalyzer getAnalyzer() {
202N/A return DEFAULT_ANALYZER_FACTORY.getAnalyzer();
0N/A }
143N/A
143N/A /**
143N/A * Get an analyzer suited to analyze a file. This function will reuse
143N/A * analyzers since they are costly.
143N/A *
143N/A * @param in Input stream containing data to be analyzed
143N/A * @param file Name of the file to be analyzed
143N/A * @return An analyzer suited for that file content
143N/A * @throws java.io.IOException If an error occurs while accessing the
419N/A * data in the input stream.
0N/A */
143N/A public static FileAnalyzer getAnalyzer(InputStream in, String file) throws IOException {
202N/A FileAnalyzerFactory factory = find(in, file);
202N/A if (factory == null) {
202N/A return getAnalyzer();
0N/A }
202N/A return factory.getAnalyzer();
0N/A }
143N/A
143N/A /**
143N/A * Create a Lucene document and fill in the required fields
143N/A * @param file The file to index
143N/A * @param in The data to generate the index for
143N/A * @param path Where the file is located (from source root)
143N/A * @return The Lucene document to add to the index database
143N/A * @throws java.io.IOException If an exception occurs while collecting the
143N/A * datas
143N/A */
224N/A public Document getDocument(File file, InputStream in, String path,
224N/A FileAnalyzer fa) throws IOException {
0N/A Document doc = new Document();
143N/A String date = DateTools.timeToString(file.lastModified(), DateTools.Resolution.MILLISECOND);
816N/A doc.add(new Field("u", Util.uid(path, date), Field.Store.YES, Field.Index.NOT_ANALYZED));
819N/A doc.add(new Field("fullpath", file.getAbsolutePath(), Field.Store.NO, Field.Index.NOT_ANALYZED));
143N/A
143N/A try {
143N/A HistoryReader hr = HistoryGuru.getInstance().getHistoryReader(file);
0N/A if (hr != null) {
99N/A doc.add(new Field("hist", hr));
0N/A // date = hr.getLastCommentDate() //RFE
0N/A }
615N/A } catch (HistoryException e) {
424N/A OpenGrokLogger.getLogger().log(Level.WARNING, "An error occurred while reading history: ", e);
0N/A }
816N/A doc.add(new Field("date", date, Field.Store.YES, Field.Index.NOT_ANALYZED));
143N/A if (path != null) {
816N/A doc.add(new Field("path", path, Field.Store.YES, Field.Index.ANALYZED));
123N/A Project project = Project.getProject(path);
123N/A if (project != null) {
816N/A doc.add(new Field("project", project.getPath(), Field.Store.YES, Field.Index.ANALYZED));
58N/A }
143N/A }
224N/A
0N/A if (fa != null) {
508N/A Genre g = fa.getGenre();
508N/A if (g == Genre.PLAIN) {
816N/A doc.add(new Field("t", "p", Field.Store.YES, Field.Index.NOT_ANALYZED));
508N/A } else if (g == Genre.XREFABLE) {
816N/A doc.add(new Field("t", "x", Field.Store.YES, Field.Index.NOT_ANALYZED));
508N/A } else if (g == Genre.HTML) {
816N/A doc.add(new Field("t", "h", Field.Store.YES, Field.Index.NOT_ANALYZED));
0N/A }
508N/A fa.analyze(doc, in);
0N/A }
143N/A
0N/A return doc;
0N/A }
143N/A
0N/A /**
143N/A * Get the content type for a named file.
143N/A *
143N/A * @param in The input stream we want to get the content type for (if
143N/A * we cannot determine the content type by the filename)
143N/A * @param file The name of the file
216N/A * @return The contentType suitable for printing to response.setContentType() or null
216N/A * if the factory was not found
143N/A * @throws java.io.IOException If an error occurs while accessing the input
143N/A * stream.
143N/A */
143N/A public static String getContentType(InputStream in, String file) throws IOException {
216N/A FileAnalyzerFactory factory = find(in, file);
216N/A String type = null;
395N/A if (factory != null) {
216N/A type = factory.getContentType();
395N/A }
216N/A return type;
0N/A }
143N/A
143N/A /**
143N/A * Write a browsable version of the file
143N/A *
202N/A * @param factory The analyzer factory for this filetype
143N/A * @param in The input stream containing the data
143N/A * @param out Where to write the result
1127N/A * @param defs definitions for the source file, if available
143N/A * @param annotation Annotation information for the file
271N/A * @param project Project the file belongs to
143N/A * @throws java.io.IOException If an error occurs while creating the
143N/A * output
143N/A */
921N/A public static void writeXref(FileAnalyzerFactory factory, Reader in,
1127N/A Writer out, Definitions defs,
1127N/A Annotation annotation, Project project)
202N/A throws IOException
202N/A {
922N/A Reader input = in;
922N/A if (factory.getGenre() == Genre.PLAIN) {
922N/A // This is some kind of text file, so we need to expand tabs to
922N/A // spaces to match the project's tab settings.
922N/A input = ExpandTabsReader.wrap(in, project);
922N/A }
1127N/A factory.writeXref(input, out, defs, annotation, project);
0N/A }
143N/A
0N/A /**
143N/A * Get the genre of a file
143N/A *
143N/A * @param file The file to inpect
0N/A * @return The genre suitable to decide how to display the file
0N/A */
143N/A public static Genre getGenre(String file) {
202N/A return getGenre(find(file));
143N/A }
143N/A
143N/A /**
143N/A * Get the genre of a bulk of data
143N/A *
143N/A * @param in A stream containing the data
143N/A * @return The genre suitable to decide how to display the file
143N/A * @throws java.io.IOException If an error occurs while getting the content
143N/A */
0N/A public static Genre getGenre(InputStream in) throws IOException {
202N/A return getGenre(find(in));
0N/A }
143N/A
143N/A /**
143N/A * Get the genre for a named class (this is most likely an analyzer)
202N/A * @param factory the analyzer factory to get the genre for
143N/A * @return The genre of this class (null if not found)
143N/A */
202N/A public static Genre getGenre(FileAnalyzerFactory factory) {
202N/A if (factory != null) {
202N/A return factory.getGenre();
0N/A }
202N/A return null;
0N/A }
143N/A
0N/A /**
210N/A * Find a {@code FileAnalyzerFactory} with the specified class name. If one
210N/A * doesn't exist, create one and register it.
210N/A *
210N/A * @param factoryClassName name of the factory class
210N/A * @return a file analyzer factory
210N/A *
210N/A * @throws ClassNotFoundException if there is no class with that name
210N/A * @throws ClassCastException if the class is not a subclass of {@code
210N/A * FileAnalyzerFactory}
210N/A * @throws IllegalAccessException if the constructor cannot be accessed
210N/A * @throws InstantiationException if the class cannot be instantiated
210N/A */
210N/A public static FileAnalyzerFactory findFactory(String factoryClassName)
210N/A throws ClassNotFoundException, IllegalAccessException,
210N/A InstantiationException
210N/A {
210N/A return findFactory(Class.forName(factoryClassName));
210N/A }
210N/A
210N/A /**
210N/A * Find a {@code FileAnalyzerFactory} which is an instance of the specified
210N/A * class. If one doesn't exist, create one and register it.
210N/A *
210N/A * @param factoryClass the factory class
210N/A * @return a file analyzer factory
210N/A *
210N/A * @throws ClassCastException if the class is not a subclass of {@code
210N/A * FileAnalyzerFactory}
210N/A * @throws IllegalAccessException if the constructor cannot be accessed
210N/A * @throws InstantiationException if the class cannot be instantiated
210N/A */
210N/A private static FileAnalyzerFactory findFactory(Class factoryClass)
210N/A throws InstantiationException, IllegalAccessException
210N/A {
210N/A for (FileAnalyzerFactory f : factories) {
210N/A if (f.getClass() == factoryClass) {
210N/A return f;
210N/A }
210N/A }
210N/A FileAnalyzerFactory f =
210N/A (FileAnalyzerFactory) factoryClass.newInstance();
210N/A registerAnalyzer(f);
210N/A return f;
210N/A }
210N/A
210N/A /**
143N/A * Finds a suitable analyser class for file name. If the analyzer cannot
143N/A * be determined by the file extension, try to look at the data in the
143N/A * InputStream to find a suitable analyzer.
143N/A *
0N/A * Use if you just want to find file type.
143N/A *
143N/A *
143N/A * @param in The input stream containing the data
143N/A * @param file The file name to get the analyzer for
202N/A * @return the analyzer factory to use
143N/A * @throws java.io.IOException If a problem occurs while reading the data
0N/A */
202N/A public static FileAnalyzerFactory find(InputStream in, String file)
202N/A throws IOException
202N/A {
202N/A FileAnalyzerFactory factory = find(file);
1072N/A //TODO above is not that great, since if 2 analyzers share one extension
1072N/A //then only the first one registered will own it
1072N/A //it would be cool if above could return more analyzers and below would
1072N/A //then decide between them ...
202N/A if (factory != null) {
202N/A return factory;
0N/A }
202N/A return find(in);
0N/A }
143N/A
143N/A /**
143N/A * Finds a suitable analyser class for file name.
143N/A *
143N/A * @param file The file name to get the analyzer for
202N/A * @return the analyzer factory to use
143N/A */
202N/A public static FileAnalyzerFactory find(String file) {
440N/A String path = file;
0N/A int i = 0;
460N/A if (((i = path.lastIndexOf('/')) > 0 || (i = path.lastIndexOf('\\')) > 0)
460N/A && (i + 1 < path.length())) {
460N/A path = path.substring(i + 1);
0N/A }
440N/A int dotpos = path.lastIndexOf('.');
143N/A if (dotpos >= 0) {
202N/A FileAnalyzerFactory factory =
1072N/A ext.get(path.substring(dotpos + 1).toUpperCase(Locale.getDefault()));
202N/A if (factory != null) {
202N/A return factory;
0N/A }
0N/A }
483N/A // file doesn't have any of the extensions we know, try full match
1072N/A return FILE_NAMES.get(path.toUpperCase(Locale.getDefault()));
0N/A }
143N/A
143N/A /**
143N/A * Finds a suitable analyser class for the data in this stream
143N/A *
143N/A * @param in The stream containing the data to analyze
202N/A * @return the analyzer factory to use
143N/A * @throws java.io.IOException if an error occurs while reading data from
143N/A * the stream
143N/A */
202N/A public static FileAnalyzerFactory find(InputStream in) throws IOException {
0N/A in.mark(8);
0N/A byte[] content = new byte[8];
0N/A int len = in.read(content);
0N/A in.reset();
143N/A if (len < 4) {
0N/A return null;
143N/A }
143N/A
202N/A FileAnalyzerFactory factory = find(content);
202N/A if (factory != null) {
202N/A return factory;
202N/A }
202N/A
202N/A for (FileAnalyzerFactory.Matcher matcher : matchers) {
257N/A FileAnalyzerFactory fac = matcher.isMagic(content, in);
202N/A if (fac != null) {
202N/A return fac;
0N/A }
0N/A }
202N/A
202N/A return null;
0N/A }
143N/A
143N/A /**
143N/A * Finds a suitable analyser class for a magic signature
143N/A *
143N/A * @param signature the magic signature look up
202N/A * @return the analyzer factory to use
143N/A */
956N/A private static FileAnalyzerFactory find(byte[] signature)
956N/A throws IOException {
956N/A // XXX this assumes ISO-8859-1 encoding (and should work in most cases
956N/A // for US-ASCII, UTF-8 and other ISO-8859-* encodings, but not always),
956N/A // we should try to be smarter than this...
143N/A char[] chars = new char[signature.length > 8 ? 8 : signature.length];
143N/A for (int i = 0; i < chars.length; i++) {
143N/A chars[i] = (char) (0xFF & signature[i]);
0N/A }
143N/A
956N/A String sig = new String(chars);
956N/A
956N/A FileAnalyzerFactory a = magics.get(sig);
0N/A if (a == null) {
200N/A String sigWithoutBOM = stripBOM(signature);
202N/A for (Map.Entry<String, FileAnalyzerFactory> entry :
200N/A magics.entrySet()) {
956N/A if (sig.startsWith(entry.getKey())) {
200N/A return entry.getValue();
200N/A }
200N/A // See if text files have the magic sequence if we remove the
200N/A // byte-order marker
200N/A if (sigWithoutBOM != null &&
202N/A entry.getValue().getGenre() == Genre.PLAIN &&
200N/A sigWithoutBOM.startsWith(entry.getKey())) {
200N/A return entry.getValue();
0N/A }
0N/A }
0N/A }
0N/A return a;
0N/A }
143N/A
200N/A /** Byte-order markers. */
956N/A private static final Map<String, byte[]> BOMS =
956N/A new HashMap<String, byte[]>();
956N/A static {
956N/A BOMS.put("UTF-8", new byte[] {(byte) 0xEF, (byte) 0xBB, (byte) 0xBF});
956N/A BOMS.put("UTF-16BE", new byte[] {(byte) 0xFE, (byte) 0xFF});
956N/A BOMS.put("UTF-16LE", new byte[] {(byte) 0xFF, (byte) 0xFE});
200N/A };
200N/A
200N/A /**
200N/A * Strip away the byte-order marker from the string, if it has one.
200N/A *
956N/A * @param sig a sequence of bytes from which to remove the BOM
200N/A * @return a string without the byte-order marker, or <code>null</code> if
200N/A * the string doesn't start with a BOM
200N/A */
956N/A public static String stripBOM(byte[] sig) throws IOException {
956N/A for (Map.Entry<String, byte[]> entry : BOMS.entrySet()) {
956N/A String encoding = entry.getKey();
956N/A byte[] bom = entry.getValue();
956N/A if (sig.length > bom.length) {
956N/A int i = 0;
956N/A while (i < bom.length && sig[i] == bom[i]) {
956N/A i++;
956N/A }
956N/A if (i == bom.length) {
956N/A // BOM matched beginning of signature
956N/A return new String(
956N/A sig,
956N/A bom.length, // offset
956N/A sig.length - bom.length, // length
956N/A encoding);
956N/A }
200N/A }
200N/A }
200N/A return null;
200N/A }
148N/A}