opengrok/analysis/AnalyzerGuru.java

	AnalyzerGuru.java revision 224
0N/A/*
0N/A * CDDL HEADER START
0N/A *
0N/A * The contents of this file are subject to the terms of the
0N/A * Common Development and Distribution License (the "License").
0N/A * You may not use this file except in compliance with the License.
0N/A *
0N/A * See LICENSE.txt included in this distribution for the specific
0N/A * language governing permissions and limitations under the License.
0N/A *
0N/A * When distributing Covered Code, include this CDDL HEADER in each
0N/A * file and include the License file at LICENSE.txt.
0N/A * If applicable, add the following below this CDDL HEADER, with the
0N/A * fields enclosed by brackets "[]" replaced with your own identifying
0N/A * information: Portions Copyright [yyyy] [name of copyright owner]
0N/A *
0N/A * CDDL HEADER END
0N/A */
0N/A
0N/A/*
143N/A * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
0N/A * Use is subject to license terms.
0N/A */
0N/Apackage org.opensolaris.opengrok.analysis;
0N/A
143N/Aimport java.io.BufferedInputStream;
143N/Aimport java.io.File;
143N/Aimport java.io.FileInputStream;
143N/Aimport java.io.IOException;
143N/Aimport java.io.InputStream;
143N/Aimport java.io.OutputStreamWriter;
143N/Aimport java.io.Reader;
143N/Aimport java.io.StringReader;
143N/Aimport java.io.Writer;
143N/Aimport java.util.ArrayList;
143N/Aimport java.util.HashMap;
143N/Aimport java.util.Iterator;
202N/Aimport java.util.List;
200N/Aimport java.util.Map;
143N/Aimport java.util.SortedMap;
143N/Aimport java.util.TreeMap;
143N/Aimport org.apache.lucene.analysis.Token;
143N/Aimport org.apache.lucene.analysis.TokenStream;
143N/Aimport org.apache.lucene.document.DateTools;
143N/Aimport org.apache.lucene.document.Document;
99N/Aimport org.apache.lucene.document.Field;
0N/Aimport org.opensolaris.opengrok.analysis.FileAnalyzer.Genre;
202N/Aimport org.opensolaris.opengrok.analysis.archive.BZip2AnalyzerFactory;
202N/Aimport org.opensolaris.opengrok.analysis.archive.GZIPAnalyzerFactory;
202N/Aimport org.opensolaris.opengrok.analysis.archive.TarAnalyzerFactory;
202N/Aimport org.opensolaris.opengrok.analysis.archive.ZipAnalyzerFactory;
202N/Aimport org.opensolaris.opengrok.analysis.c.CAnalyzerFactory;
202N/Aimport org.opensolaris.opengrok.analysis.data.IgnorantAnalyzerFactory;
202N/Aimport org.opensolaris.opengrok.analysis.data.ImageAnalyzerFactory;
202N/Aimport org.opensolaris.opengrok.analysis.document.TroffAnalyzerFactory;
202N/Aimport org.opensolaris.opengrok.analysis.executables.ELFAnalyzerFactory;
202N/Aimport org.opensolaris.opengrok.analysis.executables.JarAnalyzerFactory;
202N/Aimport org.opensolaris.opengrok.analysis.executables.JavaClassAnalyzerFactory;
202N/Aimport org.opensolaris.opengrok.analysis.java.JavaAnalyzerFactory;
202N/Aimport org.opensolaris.opengrok.analysis.lisp.LispAnalyzerFactory;
202N/Aimport org.opensolaris.opengrok.analysis.plain.PlainAnalyzerFactory;
202N/Aimport org.opensolaris.opengrok.analysis.plain.XMLAnalyzerFactory;
202N/Aimport org.opensolaris.opengrok.analysis.sh.ShAnalyzerFactory;
202N/Aimport org.opensolaris.opengrok.analysis.sql.SQLAnalyzerFactory;
58N/Aimport org.opensolaris.opengrok.configuration.Project;
143N/Aimport org.opensolaris.opengrok.history.Annotation;
143N/Aimport org.opensolaris.opengrok.history.HistoryGuru;
143N/Aimport org.opensolaris.opengrok.history.HistoryReader;
0N/Aimport org.opensolaris.opengrok.web.Util;
0N/A
0N/A/**
143N/A * Manages and provides Analyzers as needed. Please see
143N/A * <a href="http://www.opensolaris.org/os/project/opengrok/manual/internals/">
143N/A * this</a> page for a great description of the purpose of the AnalyzerGuru.
143N/A *
0N/A * Created on September 22, 2005
0N/A * @author Chandan
0N/A */
0N/Apublic class AnalyzerGuru {
143N/A
202N/A    /** The default {@code FileAnalyzerFactory} instance. */
202N/A    private static final FileAnalyzerFactory
202N/A        DEFAULT_ANALYZER_FACTORY = new FileAnalyzerFactory();
202N/A
202N/A    /** Map from file extensions to analyzer factories. */
202N/A    private static final Map<String, FileAnalyzerFactory>
202N/A        ext = new HashMap<String, FileAnalyzerFactory>();
202N/A
202N/A    // TODO: have a comparator
202N/A    /** Map from magic strings to analyzer factories. */
202N/A    private static final SortedMap<String, FileAnalyzerFactory>
202N/A        magics = new TreeMap<String, FileAnalyzerFactory>();
202N/A
202N/A    /**
202N/A     * List of matcher objects which can be used to determine which analyzer
202N/A     * factory to use.
202N/A     */
202N/A    private static final List<FileAnalyzerFactory.Matcher>
202N/A        matchers = new ArrayList<FileAnalyzerFactory.Matcher>();
202N/A
210N/A    /** List of all registered {@code FileAnalyzerFactory} instances. */
210N/A    private static final List<FileAnalyzerFactory>
210N/A        factories = new ArrayList<FileAnalyzerFactory>();
210N/A
0N/A    /*
0N/A     * If you write your own analyzer please register it here
0N/A     */
36N/A    static {
202N/A        FileAnalyzerFactory[] analyzers = {
202N/A            DEFAULT_ANALYZER_FACTORY,
202N/A            new IgnorantAnalyzerFactory(),
202N/A            new BZip2AnalyzerFactory(),
202N/A            new XMLAnalyzerFactory(),
202N/A            new TroffAnalyzerFactory(),
202N/A            new ELFAnalyzerFactory(),
202N/A            new JavaClassAnalyzerFactory(),
202N/A            new ImageAnalyzerFactory(),
202N/A            new JarAnalyzerFactory(),
202N/A            new ZipAnalyzerFactory(),
202N/A            new TarAnalyzerFactory(),
202N/A            new CAnalyzerFactory(),
202N/A            new ShAnalyzerFactory(),
202N/A            new PlainAnalyzerFactory(),
202N/A            new GZIPAnalyzerFactory(),
202N/A            new JavaAnalyzerFactory(),
202N/A            new LispAnalyzerFactory(),
202N/A            new SQLAnalyzerFactory(),
202N/A        };
202N/A
202N/A        for (FileAnalyzerFactory analyzer : analyzers) {
210N/A            registerAnalyzer(analyzer);
0N/A        }
0N/A    }
126N/A
143N/A    /**
210N/A     * Register a {@code FileAnalyzerFactory} instance.
210N/A     */
210N/A    private static void registerAnalyzer(FileAnalyzerFactory factory) {
210N/A        for (String suffix : factory.getSuffixes()) {
210N/A            FileAnalyzerFactory old = ext.put(suffix, factory);
210N/A            assert old == null :
210N/A            "suffix '" + suffix + "' used in multiple analyzers";
210N/A        }
210N/A        for (String magic : factory.getMagicStrings()) {
210N/A            magics.put(magic, factory);
210N/A        }
210N/A        matchers.addAll(factory.getMatchers());
210N/A        factories.add(factory);
210N/A    }
210N/A
210N/A    /**
143N/A     *  Instruct the AnalyzerGuru to use a given analyzer for a given
143N/A     *  file extension.
143N/A     *  @param extension the file-extension to add
202N/A     *  @param factory   a factory which creates
202N/A     *                   the analyzer to use for the given extension
143N/A     *                  (if you pass null as the analyzer, you will disable
143N/A     *                   the analyzer used for that extension)
143N/A     */
202N/A    public static void addExtension(String extension,
202N/A                                    FileAnalyzerFactory factory) {
202N/A        if (factory == null) {
202N/A            ext.remove(extension);
202N/A        } else {
202N/A            ext.put(extension, factory);
126N/A        }
126N/A    }
143N/A
202N/A    /**
0N/A     * Get the default Analyzer.
0N/A     */
0N/A    public static FileAnalyzer getAnalyzer() {
202N/A        return DEFAULT_ANALYZER_FACTORY.getAnalyzer();
0N/A    }
143N/A
143N/A    /**
143N/A     * Get an analyzer suited to analyze a file. This function will reuse
143N/A     * analyzers since they are costly.
143N/A     *
143N/A     * @param in Input stream containing data to be analyzed
143N/A     * @param file Name of the file to be analyzed
143N/A     * @return An analyzer suited for that file content
143N/A     * @throws java.io.IOException If an error occurs while accessing the
143N/A     *                             data in the input stream.
0N/A     */
143N/A    public static FileAnalyzer getAnalyzer(InputStream in, String file) throws IOException {
202N/A        FileAnalyzerFactory factory = find(in, file);
202N/A        if (factory == null) {
202N/A            return getAnalyzer();
0N/A        }
202N/A        return factory.getAnalyzer();
0N/A    }
143N/A
143N/A    /**
143N/A     * Create a Lucene document and fill in the required fields
143N/A     * @param file The file to index
143N/A     * @param in The data to generate the index for
143N/A     * @param path Where the file is located (from source root)
143N/A     * @return The Lucene document to add to the index database
143N/A     * @throws java.io.IOException If an exception occurs while collecting the
143N/A     *                             datas
143N/A     */
224N/A    public Document getDocument(File file, InputStream in, String path,
224N/A                                FileAnalyzer fa) throws IOException {
0N/A        Document doc = new Document();
143N/A        String date = DateTools.timeToString(file.lastModified(), DateTools.Resolution.MILLISECOND);
99N/A        doc.add(new Field("u", Util.uid(path, date), Field.Store.YES, Field.Index.UN_TOKENIZED));
143N/A        doc.add(new Field("fullpath", file.getAbsolutePath(), Field.Store.YES, Field.Index.TOKENIZED));
143N/A
143N/A        try {
143N/A            HistoryReader hr = HistoryGuru.getInstance().getHistoryReader(file);
0N/A            if (hr != null) {
99N/A                doc.add(new Field("hist", hr));
0N/A                // date = hr.getLastCommentDate() //RFE
0N/A            }
0N/A        } catch (IOException e) {
99N/A            e.printStackTrace();
0N/A        }
99N/A        doc.add(new Field("date", date, Field.Store.YES, Field.Index.UN_TOKENIZED));
143N/A        if (path != null) {
99N/A            doc.add(new Field("path", path, Field.Store.YES, Field.Index.TOKENIZED));
123N/A            Project project = Project.getProject(path);
123N/A            if (project != null) {
123N/A                doc.add(new Field("project", project.getPath(), Field.Store.YES, Field.Index.TOKENIZED));
58N/A            }
143N/A        }
224N/A
0N/A        if (fa != null) {
0N/A            try {
143N/A                Genre g = fa.getGenre();
0N/A                if (g == Genre.PLAIN) {
99N/A                    doc.add(new Field("t", "p", Field.Store.YES, Field.Index.UN_TOKENIZED));
143N/A                } else if (g == Genre.XREFABLE) {
99N/A                    doc.add(new Field("t", "x", Field.Store.YES, Field.Index.UN_TOKENIZED));
143N/A                } else if (g == Genre.HTML) {
99N/A                    doc.add(new Field("t", "h", Field.Store.YES, Field.Index.UN_TOKENIZED));
0N/A                }
0N/A                fa.analyze(doc, in);
0N/A            } catch (Exception e) {
0N/A                // Ignoring any errors while analysing
0N/A            }
0N/A        }
0N/A        doc.removeField("fullpath");
143N/A
0N/A        return doc;
0N/A    }
143N/A
0N/A    /**
143N/A     * Get the content type for a named file.
143N/A     *
143N/A     * @param in The input stream we want to get the content type for (if
143N/A     *           we cannot determine the content type by the filename)
143N/A     * @param file The name of the file
216N/A     * @return The contentType suitable for printing to response.setContentType() or null
216N/A     *         if the factory was not found
143N/A     * @throws java.io.IOException If an error occurs while accessing the input
143N/A     *                             stream.
143N/A     */
143N/A    public static String getContentType(InputStream in, String file) throws IOException {
216N/A        FileAnalyzerFactory factory = find(in, file);
216N/A        String type = null;
216N/A        if (factory != null)
216N/A            type = factory.getContentType();
216N/A        return type;
0N/A    }
143N/A
143N/A    /**
143N/A     * Write a browsable version of the file
143N/A     *
202N/A     * @param factory The analyzer factory for this filetype
143N/A     * @param in The input stream containing the data
143N/A     * @param out Where to write the result
143N/A     * @param annotation Annotation information for the file
143N/A     * @throws java.io.IOException If an error occurs while creating the
143N/A     *                             output
143N/A     */
202N/A    public static void writeXref(FileAnalyzerFactory factory, InputStream in,
202N/A                                 Writer out, Annotation annotation)
202N/A        throws IOException
202N/A    {
202N/A        factory.writeXref(in, out, annotation);
0N/A    }
143N/A
0N/A    /**
143N/A     * Get the genre of a file
143N/A     *
143N/A     * @param file The file to inpect
0N/A     * @return The genre suitable to decide how to display the file
0N/A     */
143N/A    public static Genre getGenre(String file) {
202N/A        return getGenre(find(file));
143N/A    }
143N/A
143N/A    /**
143N/A     * Get the genre of a bulk of data
143N/A     *
143N/A     * @param in A stream containing the data
143N/A     * @return The genre suitable to decide how to display the file
143N/A     * @throws java.io.IOException If an error occurs while getting the content
143N/A     */
0N/A    public static Genre getGenre(InputStream in) throws IOException {
202N/A        return getGenre(find(in));
0N/A    }
143N/A
143N/A    /**
143N/A     * Get the genre for a named class (this is most likely an analyzer)
202N/A     * @param factory the analyzer factory to get the genre for
143N/A     * @return The genre of this class (null if not found)
143N/A     */
202N/A    public static Genre getGenre(FileAnalyzerFactory factory) {
202N/A        if (factory != null) {
202N/A            return factory.getGenre();
0N/A        }
202N/A        return null;
0N/A    }
143N/A
0N/A    /**
210N/A     * Find a {@code FileAnalyzerFactory} with the specified class name. If one
210N/A     * doesn't exist, create one and register it.
210N/A     *
210N/A     * @param factoryClassName name of the factory class
210N/A     * @return a file analyzer factory
210N/A     *
210N/A     * @throws ClassNotFoundException if there is no class with that name
210N/A     * @throws ClassCastException if the class is not a subclass of {@code
210N/A     * FileAnalyzerFactory}
210N/A     * @throws IllegalAccessException if the constructor cannot be accessed
210N/A     * @throws InstantiationException if the class cannot be instantiated
210N/A     */
210N/A    public static FileAnalyzerFactory findFactory(String factoryClassName)
210N/A        throws ClassNotFoundException, IllegalAccessException,
210N/A               InstantiationException
210N/A    {
210N/A        return findFactory(Class.forName(factoryClassName));
210N/A    }
210N/A
210N/A    /**
210N/A     * Find a {@code FileAnalyzerFactory} which is an instance of the specified
210N/A     * class. If one doesn't exist, create one and register it.
210N/A     *
210N/A     * @param factoryClass the factory class
210N/A     * @return a file analyzer factory
210N/A     *
210N/A     * @throws ClassCastException if the class is not a subclass of {@code
210N/A     * FileAnalyzerFactory}
210N/A     * @throws IllegalAccessException if the constructor cannot be accessed
210N/A     * @throws InstantiationException if the class cannot be instantiated
210N/A     */
210N/A    private static FileAnalyzerFactory findFactory(Class factoryClass)
210N/A        throws InstantiationException, IllegalAccessException
210N/A    {
210N/A        for (FileAnalyzerFactory f : factories) {
210N/A            if (f.getClass() == factoryClass) {
210N/A                return f;
210N/A            }
210N/A        }
210N/A        FileAnalyzerFactory f =
210N/A            (FileAnalyzerFactory) factoryClass.newInstance();
210N/A        registerAnalyzer(f);
210N/A        return f;
210N/A    }
210N/A
210N/A    /**
143N/A     * Finds a suitable analyser class for file name. If the analyzer cannot
143N/A     * be determined by the file extension, try to look at the data in the
143N/A     * InputStream to find a suitable analyzer.
143N/A     *
0N/A     * Use if you just want to find file type.
143N/A     *
143N/A     *
143N/A     * @param in The input stream containing the data
143N/A     * @param file The file name to get the analyzer for
202N/A     * @return the analyzer factory to use
143N/A     * @throws java.io.IOException If a problem occurs while reading the data
0N/A     */
202N/A    public static FileAnalyzerFactory find(InputStream in, String file)
202N/A        throws IOException
202N/A    {
202N/A        FileAnalyzerFactory factory = find(file);
202N/A        if (factory != null) {
202N/A            return factory;
0N/A        }
202N/A        return find(in);
0N/A    }
143N/A
143N/A    /**
143N/A     * Finds a suitable analyser class for file name.
143N/A     *
143N/A     * @param file The file name to get the analyzer for
202N/A     * @return the analyzer factory to use
143N/A     */
202N/A    public static FileAnalyzerFactory find(String file) {
0N/A        int i = 0;
143N/A        if ((i = file.lastIndexOf('/')) > 0 || (i = file.lastIndexOf('\\')) > 0) {
143N/A            if (i + 1 < file.length()) {
143N/A                file = file.substring(i + 1);
143N/A            }
0N/A        }
143N/A        file = file.toUpperCase();
143N/A        int dotpos = file.lastIndexOf('.');
143N/A        if (dotpos >= 0) {
202N/A            FileAnalyzerFactory factory =
202N/A                ext.get(file.substring(dotpos + 1).toUpperCase());
202N/A            if (factory != null) {
202N/A                return factory;
0N/A            }
0N/A        }
150N/A        // file doesn't have any of the extensions we know
150N/A        return null;
0N/A    }
143N/A
143N/A    /**
143N/A     * Finds a suitable analyser class for the data in this stream
143N/A     *
143N/A     * @param in The stream containing the data to analyze
202N/A     * @return the analyzer factory to use
143N/A     * @throws java.io.IOException if an error occurs while reading data from
143N/A     *                             the stream
143N/A     */
202N/A    public static FileAnalyzerFactory find(InputStream in) throws IOException {
0N/A        in.mark(8);
0N/A        byte[] content = new byte[8];
0N/A        int len = in.read(content);
0N/A        in.reset();
143N/A        if (len < 4) {
0N/A            return null;
143N/A        }
143N/A
202N/A        FileAnalyzerFactory factory = find(content);
202N/A        if (factory != null) {
202N/A            return factory;
202N/A        }
202N/A
202N/A        for (FileAnalyzerFactory.Matcher matcher : matchers) {
202N/A            FileAnalyzerFactory fac = matcher.isMagic(content);
202N/A            if (fac != null) {
202N/A                return fac;
0N/A            }
0N/A        }
202N/A
202N/A        return null;
0N/A    }
143N/A
143N/A    /**
143N/A     * Finds a suitable analyser class for a magic signature
143N/A     *
143N/A     * @param signature the magic signature look up
202N/A     * @return the analyzer factory to use
143N/A     */
202N/A    public static FileAnalyzerFactory find(byte[] signature) {
143N/A        char[] chars = new char[signature.length > 8 ? 8 : signature.length];
143N/A        for (int i = 0; i < chars.length; i++) {
143N/A            chars[i] = (char) (0xFF & signature[i]);
0N/A        }
143N/A        return findMagic(new String(chars));
0N/A    }
143N/A
143N/A    /**
143N/A     * Get an analyzer by looking up the "magic signature"
143N/A     * @param signature the signature to look up
202N/A     * @return the analyzer factory to handle data with this signature
143N/A     */
202N/A    public static FileAnalyzerFactory findMagic(String signature) {
202N/A        FileAnalyzerFactory a = magics.get(signature);
0N/A        if (a == null) {
200N/A            String sigWithoutBOM = stripBOM(signature);
202N/A            for (Map.Entry<String, FileAnalyzerFactory> entry :
200N/A                     magics.entrySet()) {
200N/A                if (signature.startsWith(entry.getKey())) {
200N/A                    return entry.getValue();
200N/A                }
200N/A                // See if text files have the magic sequence if we remove the
200N/A                // byte-order marker
200N/A                if (sigWithoutBOM != null &&
202N/A                        entry.getValue().getGenre() == Genre.PLAIN &&
200N/A                        sigWithoutBOM.startsWith(entry.getKey())) {
200N/A                    return entry.getValue();
0N/A                }
0N/A            }
0N/A        }
0N/A        return a;
0N/A    }
143N/A
200N/A    /** Byte-order markers. */
200N/A    private static final String[] BOMS = {
200N/A        new String(new char[] { 0xEF, 0xBB, 0xBF }), // UTF-8 BOM
200N/A        new String(new char[] { 0xFE, 0xFF }),       // UTF-16BE BOM
200N/A        new String(new char[] { 0xFF, 0xFE }),       // UTF-16LE BOM
200N/A    };
200N/A
200N/A    /**
200N/A     * Strip away the byte-order marker from the string, if it has one.
200N/A     *
200N/A     * @param str the string to remove the BOM from
200N/A     * @return a string without the byte-order marker, or <code>null</code> if
200N/A     * the string doesn't start with a BOM
200N/A     */
200N/A    private static String stripBOM(String str) {
200N/A        for (String bom : BOMS) {
200N/A            if (str.startsWith(bom)) {
200N/A                return str.substring(bom.length());
200N/A            }
200N/A        }
200N/A        return null;
200N/A    }
200N/A
143N/A    public static void main(String[] args) throws Exception {
0N/A        AnalyzerGuru af = new AnalyzerGuru();
0N/A        System.out.println("<pre wrap=true>");
143N/A        for (String arg : args) {
0N/A            try {
202N/A                FileAnalyzerFactory an = AnalyzerGuru.find(arg);
0N/A                File f = new File(arg);
0N/A                BufferedInputStream in = new BufferedInputStream(new FileInputStream(f));
143N/A                FileAnalyzer fa = AnalyzerGuru.getAnalyzer(in, arg);
0N/A                System.out.println("\nANALYZER = " + fa);
224N/A                Document doc = af.getDocument(f, in, arg, fa);
0N/A                System.out.println("\nDOCUMENT = " + doc);
143N/A
99N/A                Iterator iterator = doc.getFields().iterator();
99N/A                while (iterator.hasNext()) {
99N/A                    org.apache.lucene.document.Field field = (org.apache.lucene.document.Field) iterator.next();
143N/A                    if (field.isTokenized()) {
0N/A                        Reader r = field.readerValue();
143N/A                        if (r == null) {
0N/A                            r = new StringReader(field.stringValue());
0N/A                        }
0N/A                        TokenStream ts = fa.tokenStream(field.name(), r);
143N/A                        System.out.println("\nFIELD = " + field.name() + " TOKEN STREAM = " + ts.getClass().getName());
0N/A                        Token t;
143N/A                        while ((t = ts.next()) != null) {
0N/A                            System.out.print(t.termText());
0N/A                            System.out.print(' ');
0N/A                        }
0N/A                        System.out.println();
0N/A                    }
143N/A                    if (field.isStored()) {
0N/A                        System.out.println("\nFIELD = " + field.name());
143N/A                        if (field.readerValue() == null) {
0N/A                            System.out.println(field.stringValue());
0N/A                        } else {
0N/A                            System.out.println("STORING THE READER");
0N/A                        }
0N/A                    }
0N/A                }
0N/A                System.out.println("Writing XREF--------------");
0N/A                Writer out = new OutputStreamWriter(System.out);
0N/A                fa.writeXref(out);
0N/A                out.flush();
0N/A            } catch (Exception e) {
0N/A                System.err.println("ERROR: " + e.getMessage());
0N/A                e.printStackTrace();
0N/A            }
0N/A        }
0N/A    }
148N/A}