AnalyzerGuru.java revision 32
207N/A/*
207N/A * CDDL HEADER START
207N/A *
207N/A * The contents of this file are subject to the terms of the
207N/A * Common Development and Distribution License (the "License").
207N/A * You may not use this file except in compliance with the License.
207N/A *
207N/A * See LICENSE.txt included in this distribution for the specific
207N/A * language governing permissions and limitations under the License.
207N/A *
207N/A * When distributing Covered Code, include this CDDL HEADER in each
207N/A * file and include the License file at LICENSE.txt.
207N/A * If applicable, add the following below this CDDL HEADER, with the
207N/A * fields enclosed by brackets "[]" replaced with your own identifying
207N/A * information: Portions Copyright [yyyy] [name of copyright owner]
207N/A *
207N/A * CDDL HEADER END
207N/A */
207N/A
207N/A/*
1051N/A * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
207N/A * Use is subject to license terms.
1051N/A */
207N/A
207N/A/*
207N/A * ident "@(#)AnalyzerGuru.java 1.3 06/02/22 SMI"
207N/A */
207N/Apackage org.opensolaris.opengrok.analysis;
207N/A
207N/Aimport org.opensolaris.opengrok.analysis.FileAnalyzer.Genre;
207N/Aimport org.opensolaris.opengrok.analysis.document.TroffAnalyzer;
207N/Aimport org.opensolaris.opengrok.analysis.java.JavaAnalyzer;
207N/Aimport org.opensolaris.opengrok.analysis.plain.*;
282N/Aimport org.opensolaris.opengrok.analysis.c.*;
207N/Aimport org.opensolaris.opengrok.analysis.sh.*;
261N/Aimport org.opensolaris.opengrok.analysis.data.*;
320N/Aimport java.io.*;
312N/Aimport java.util.*;
207N/Aimport java.lang.reflect.*;
207N/Aimport org.apache.lucene.document.*;
207N/Aimport org.apache.lucene.analysis.*;
207N/Aimport org.opensolaris.opengrok.analysis.archive.*;
207N/Aimport org.opensolaris.opengrok.analysis.executables.*;
207N/Aimport org.opensolaris.opengrok.history.*;
207N/Aimport org.opensolaris.opengrok.web.Util;
207N/A
/**
 * Manages and provides Analyzers as needed.
 * Created on September 22, 2005
 *
 * @author Chandan
 */
207N/Apublic class AnalyzerGuru {
207N/A private static HashMap<String, Class> ext;
207N/A private static SortedMap<String, Class> magics;
207N/A private static ArrayList<Method> matchers;
678N/A /*
480N/A * If you write your own analyzer please register it here
207N/A */
207N/A private static Class[] analyzers = {
207N/A IgnorantAnalyzer.class,
207N/A BZip2Analyzer.class,
207N/A FileAnalyzer.class,
207N/A XMLAnalyzer.class,
207N/A TroffAnalyzer.class,
928N/A ELFAnalyzer.class,
207N/A JavaClassAnalyzer.class,
207N/A ImageAnalyzer.class,
207N/A JarAnalyzer.class,
207N/A ZipAnalyzer.class,
207N/A TarAnalyzer.class,
207N/A CAnalyzer.class,
207N/A ShAnalyzer.class,
207N/A PlainAnalyzer.class,
207N/A GZIPAnalyzer.class,
1026N/A JavaAnalyzer.class
207N/A };
207N/A private static HashMap<Class, FileAnalyzer> analyzerInstances = new HashMap<Class, FileAnalyzer>();
207N/A
207N/A /**
253N/A * Initializes an AnalyzerGuru
359N/A */
207N/A static {
359N/A if (ext == null) {
274N/A ext = new HashMap<String, Class>();
320N/A }
656N/A if (magics == null) {
928N/A magics = new TreeMap<String, Class>();
656N/A // TODO: have a comparator
207N/A }
207N/A if (matchers == null) {
207N/A matchers = new ArrayList<Method>();
207N/A }
207N/A for (Class analyzer: analyzers) {
207N/A try{
207N/A String[] suffixes = (String[]) analyzer.getField("suffixes").get(null);
928N/A for (String suffix: suffixes) {
207N/A //System.err.println(analyzer.getSimpleName() + " = " + suffix);
207N/A ext.put(suffix, analyzer);
207N/A }
207N/A } catch (Exception e) {
207N/A // System.err.println("AnalyzerFinder:" + analyzer.getSimpleName() + e);
207N/A }
207N/A try{
928N/A String[] smagics = (String[]) analyzer.getField("magics").get(null);
207N/A for (String magic: smagics) {
928N/A //System.err.println(analyzer.getSimpleName() + " = " + magic);
207N/A magics.put(magic, analyzer);
207N/A }
207N/A } catch (Exception e) {
207N/A // System.err.println("AnalyzerFinder: " + analyzer.getSimpleName() + e);
207N/A }
207N/A try{
261N/A Method m = analyzer.getMethod("isMagic", byte[].class);
459N/A if (m != null) matchers.add(m);
207N/A } catch (Exception e) {
459N/A }
261N/A }
207N/A //System.err.println("Exts " + ext);
207N/A //System.err.println("Matchers " + matchers);
207N/A }
207N/A
261N/A /*
207N/A * Get the default Analyzer.
459N/A */
207N/A public static FileAnalyzer getAnalyzer() {
312N/A
207N/A Class a = FileAnalyzer.class;
564N/A FileAnalyzer fa = analyzerInstances.get(a);
564N/A if (fa == null) {
207N/A try {
207N/A fa = (FileAnalyzer) a.newInstance();
564N/A analyzerInstances.put(a, fa);
207N/A return fa;
207N/A } catch (Exception e) {
564N/A System.err.println("ERROR: Initializing " + a);
564N/A }
564N/A }
564N/A return fa;
564N/A }
207N/A
207N/A /*
207N/A * use this if you want to analyze a file. Analyzers are costly.
261N/A */
261N/A public static FileAnalyzer getAnalyzer(InputStream in, String path) throws IOException {
261N/A Class a = find(in, path);
1054N/A if(a == null) {
261N/A a = FileAnalyzer.class;
261N/A }
261N/A if (a != null) {
261N/A FileAnalyzer fa = analyzerInstances.get(a);
320N/A if (fa == null) {
261N/A try {
261N/A fa = (FileAnalyzer) a.newInstance();
261N/A analyzerInstances.put(a, fa);
207N/A return fa;
207N/A } catch (Exception e) {
207N/A System.err.println("ERROR: Initializing " + a);
668N/A }
668N/A } else {
668N/A return fa;
668N/A }
668N/A }
668N/A return null;
668N/A }
668N/A
668N/A public Document getDocument(File f, InputStream in, String path) throws IOException {
668N/A Document doc = new Document();
668N/A String date = DateField.timeToString(f.lastModified());
668N/A doc.add(new org.apache.lucene.document.Field("u", Util.uid(path, date), false, true, false));
668N/A doc.add(new org.apache.lucene.document.Field("fullpath", f.getAbsolutePath(), true, true, true));
668N/A try{
1054N/A HistoryReader hr = HistoryGuru.getInstance().getHistoryReader(f);
668N/A if (hr != null) {
668N/A doc.add(org.apache.lucene.document.Field.Text("hist", hr));
668N/A // date = hr.getLastCommentDate() //RFE
668N/A }
668N/A } catch (IOException e) {
668N/A }
668N/A doc.add(org.apache.lucene.document.Field.Keyword("date", date));
668N/A if(path != null) {
668N/A doc.add(new org.apache.lucene.document.Field("path", path, true, true, true));
668N/A }
668N/A FileAnalyzer fa = null;
668N/A try {
668N/A fa = getAnalyzer(in, path);
668N/A } catch (Exception e) {
668N/A
668N/A }
668N/A if (fa != null) {
668N/A try {
668N/A Genre g = fa.getGenre();
668N/A if (g == Genre.PLAIN) {
1054N/A doc.add(new org.apache.lucene.document.Field("t", "p", true, false, false));
668N/A } else if ( g == Genre.XREFABLE) {
668N/A doc.add(new org.apache.lucene.document.Field("t", "x", true, false, false));
668N/A } else if ( g == Genre.HTML) {
668N/A doc.add(new org.apache.lucene.document.Field("t", "h", true, false, false));
668N/A }
668N/A fa.analyze(doc, in);
668N/A } catch (Exception e) {
668N/A // Ignoring any errors while analysing
668N/A }
668N/A }
668N/A doc.removeField("fullpath");
1054N/A return doc;
668N/A }
668N/A
668N/A /**
668N/A * @return The contentType suitable for printing to response.setContentType()
668N/A */
668N/A public static String getContentType(String path) {
668N/A Class a = find(path);
668N/A return getContentType(a);
668N/A }
668N/A
668N/A public static String getContentType(InputStream in, String path) throws IOException {
668N/A Class a = find(in, path);
460N/A return getContentType(a);
580N/A }
580N/A
580N/A public static String getContentType(Class a) {
580N/A String contentType = null;
580N/A if (a != null) {
580N/A try {
580N/A contentType = (String) a.getMethod("getContentType").invoke(null);
580N/A } catch (Exception e ) {
580N/A
580N/A }
580N/A }
580N/A return contentType;
580N/A }
580N/A
580N/A public static void writeXref(Class a, InputStream in, Writer out) throws IOException {
580N/A if (a != null) {
207N/A try {
580N/A a.getMethod("writeXref", InputStream.class, Writer.class).invoke(null, in, out);
580N/A } catch (IllegalArgumentException ex) {
580N/A } catch (SecurityException ex) {
580N/A } catch (NoSuchMethodException ex) {
580N/A } catch (InvocationTargetException ex) {
928N/A } catch (IllegalAccessException ex) {
580N/A }
928N/A }
207N/A }
928N/A
928N/A /**
580N/A * @return The genre suitable to decide how to display the file
1026N/A */
580N/A public static Genre getGenre(String path) {
580N/A Class a = find(path);
580N/A return getGenre(a);
580N/A }
580N/A
580N/A public static Genre getGenre(InputStream in, String path) throws IOException {
580N/A Class a = find(in, path);
580N/A return getGenre(a);
359N/A }
207N/A
207N/A public static Genre getGenre(InputStream in) throws IOException {
207N/A Class a = find(in);
274N/A return getGenre(a);
274N/A }
274N/A
274N/A public static Genre getGenre(Class a) {
274N/A Genre g = null;
297N/A if (a != null) {
274N/A try {
464N/A g = (Genre) a.getField("g").get(null);
274N/A } catch (Exception e ) {
439N/A e.printStackTrace();
439N/A }
439N/A }
464N/A return g;
439N/A }
297N/A
439N/A /**
460N/A * Finds a suitable analyser class for an InputStream and a file name
439N/A * Use if you just want to find file type.
274N/A */
460N/A public static Class find(InputStream in, String path) throws IOException {
460N/A Class a = find(path);
274N/A if(a == null) {
274N/A a = find(in);
274N/A }
274N/A return a;
207N/A }
459N/A
678N/A public static Class find(String path) {
207N/A int i = 0;
678N/A if ((i = path.lastIndexOf('/')) > 0 || (i = path.lastIndexOf('\\')) > 0) {
359N/A if(i+1<path.length())
359N/A path = path.substring(i+1);
459N/A }
359N/A path = path.toUpperCase();
359N/A int dotpos = path.lastIndexOf('.');
359N/A if(dotpos >= 0) {
359N/A Class analyzer = ext.get(path.substring(dotpos+1).toUpperCase());
656N/A if (analyzer != null) {
694N/A //System.err.println(path.substring(dotpos+1).toUpperCase() + " = " + analyzer.getSimpleName());
694N/A return analyzer;
656N/A }
694N/A }
656N/A return(ext.get(path));
656N/A }
656N/A
656N/A public static Class find(InputStream in) throws IOException {
656N/A in.mark(8);
207N/A byte[] content = new byte[8];
816N/A int len = in.read(content);
816N/A in.reset();
255N/A if (len < 4)
207N/A return null;
456N/A Class a = find(content);
460N/A if(a == null) {
460N/A for(Method matcher: matchers) {
460N/A try {
274N/A //System.out.println("USING = " + matcher.getName());
274N/A if ((a = (Class) matcher.invoke(null, content))!= null) {
207N/A return a;
651N/A }
274N/A } catch (Exception e ) {
274N/A e.printStackTrace();
456N/A }
274N/A }
274N/A }
274N/A return a;
274N/A }
274N/A
651N/A public static Class find(byte[] content) {
651N/A char[] chars = new char[content.length > 8 ? 8 : content.length];
274N/A for (int i = 0; i< chars.length ; i++) {
972N/A chars[i] = (char)(0xFF & content[i]);
457N/A }
457N/A return(findMagic(new String(chars)));
207N/A }
1112N/A
1108N/A public static Class findMagic(String content) {
1114N/A Class a = magics.get(content);
1114N/A if (a == null) {
1114N/A for(String magic: magics.keySet()) {
1114N/A if(content.startsWith(magic)) {
1114N/A return magics.get(magic);
1114N/A }
1112N/A }
1108N/A }
1108N/A return a;
207N/A }
457N/A
457N/A public static void main(String [] args) throws Exception {
457N/A AnalyzerGuru af = new AnalyzerGuru();
457N/A System.out.println("<pre wrap=true>");
457N/A for(String arg: args) {
457N/A try {
274N/A Class an = af.find(arg);
207N/A File f = new File(arg);
207N/A BufferedInputStream in = new BufferedInputStream(new FileInputStream(f));
207N/A FileAnalyzer fa = af.getAnalyzer(in, arg);
1054N/A System.out.println("\nANALYZER = " + fa);
207N/A Document doc = af.getDocument(f, in, arg);
207N/A System.out.println("\nDOCUMENT = " + doc);
508N/A Enumeration fields = doc.fields();
207N/A while (fields.hasMoreElements()) {
207N/A org.apache.lucene.document.Field field = (org.apache.lucene.document.Field) fields.nextElement();
656N/A if(field.isTokenized()){
656N/A Reader r = field.readerValue();
656N/A if(r == null) {
656N/A r = new StringReader(field.stringValue());
656N/A }
656N/A TokenStream ts = fa.tokenStream(field.name(), r);
656N/A System.out.println("\nFIELD = " + field.name() + " TOKEN STREAM = "+ ts.getClass().getName());
656N/A Token t;
656N/A while((t = ts.next()) != null) {
359N/A System.out.print(t.termText());
359N/A System.out.print(' ');
359N/A }
207N/A System.out.println();
207N/A }
359N/A if(field.isStored()) {
253N/A System.out.println("\nFIELD = " + field.name());
253N/A if(field.readerValue() == null) {
253N/A System.out.println(field.stringValue());
207N/A } else {
667N/A System.out.println("STORING THE READER");
667N/A }
667N/A }
672N/A }
1054N/A System.out.println("Writing XREF--------------");
672N/A Writer out = new OutputStreamWriter(System.out);
667N/A fa.writeXref(out);
672N/A out.flush();
1054N/A } catch (Exception e) {
672N/A System.err.println("ERROR: " + e.getMessage());
667N/A e.printStackTrace();
207N/A }
207N/A }
207N/A }
207N/A}
270N/A