Summarizer.java revision 460
/**
 * Copyright 2005 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.opensolaris.opengrok.search;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.WildcardQuery;

/**
 * Implements hit summarization.
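 *
 * A minimal usage sketch (assuming the {@code Query} and {@code Analyzer}
 * match the ones used when the text was indexed):
 * <pre>
 *   Summarizer summarizer = new Summarizer(query, analyzer);
 *   Summary summary = summarizer.getSummary(documentText);
 * </pre>
 */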
public class Summarizer {

    /** The number of context terms to display preceding and following matches. */
    private static final int SUM_CONTEXT = 10;

    /** The total number of terms to display in a summary. */
    private static final int SUM_LENGTH = 20;

    /** Converts text to tokens. */
    private final Analyzer analyzer;

    /** The query terms to highlight in the summary. */
    private final Set<String> highlight = new HashSet<String>();

    public Summarizer(Query query, Analyzer a) {
        analyzer = a;
        getTerms(query);
    }

    /**
     * Class Excerpt represents a single passage found in the
     * document, with some appropriate regions highlighted.
     */
    static class Excerpt {
        List<Summary.Fragment> passages = new ArrayList<Summary.Fragment>();
        Set<String> tokenSet = new TreeSet<String>();
        int numTerms = 0;

        /**
         * Add a token to the set of unique tokens.
         */
        public void addToken(String token) {
            tokenSet.add(token);
        }

        /**
         * Return the number of unique tokens in this excerpt.
         */
        public int numUniqueTokens() {
            return tokenSet.size();
        }

        /**
         * Return the number of fragments in this excerpt.
         */
        public int numFragments() {
            return passages.size();
        }

        public void setNumTerms(int numTerms) {
            this.numTerms = numTerms;
        }

        public int getNumTerms() {
            return numTerms;
        }

        /**
         * Add a fragment to the list.
         */
        public void add(Summary.Fragment fragment) {
            passages.add(fragment);
        }

        /**
         * Return the list of fragments.
         */
        public List<Summary.Fragment> elements() {
            return passages;
        }
    }

    /** Returns a summary for the given pre-tokenized text. */
    public Summary getSummary(String text) throws IOException {
        if (text == null) {
            return null;
        }
        // Simplistic implementation. Finds the first fragments in the document
        // containing any query terms.
        //
        // @TODO: check that phrases in the query are matched in the fragment

        Token[] tokens = getTokens(text);     // parse text to token array

        if (tokens.length == 0) {
            return new Summary();
        }

        //
        // Create a SortedSet that ranks excerpts according to
        // how many query terms are present. An excerpt is
        // a List full of Fragments and Highlights.
        //
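        // The comparator sorts ascending, so excerptSet.last() yields
        // the excerpt containing the most unique query terms.
        //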
        @SuppressWarnings("PMD.ConfusingTernary")
        SortedSet<Excerpt> excerptSet = new TreeSet<Excerpt>(new Comparator<Excerpt>() {
            public int compare(Excerpt excerpt1, Excerpt excerpt2) {
                if (excerpt1 != null && excerpt2 != null) {
                    int numToks1 = excerpt1.numUniqueTokens();
                    int numToks2 = excerpt2.numUniqueTokens();

                    if (numToks1 < numToks2) {
                        return -1;
                    } else if (numToks1 == numToks2) {
                        return excerpt1.numFragments() - excerpt2.numFragments();
                    } else {
                        return 1;
                    }
                } else if (excerpt1 == null && excerpt2 != null) {
                    return -1;
                } else if (excerpt1 != null && excerpt2 == null) {
                    return 1;
                } else {
                    return 0;
                }
            }
        });

        //
        // Iterate through all terms in the document
        //
        int lastExcerptPos = 0;
        for (int i = 0; i < tokens.length; i++) {
            //
            // If we find a term that's in the query...
            //
            if (highlight.contains(tokens[i].termText())) {
                //
                // Start searching at a point SUM_CONTEXT terms back,
                // and move SUM_CONTEXT terms into the future.
                //
                int startToken = (i > SUM_CONTEXT) ? i - SUM_CONTEXT : 0;
                int endToken = Math.min(i + SUM_CONTEXT, tokens.length);
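                // offset tracks the end of the last text already copied
                // into the excerpt, so each fragment starts where the
                // previous fragment or highlight ended.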
                int offset = tokens[startToken].startOffset();
                int j = startToken;

                //
                // Iterate from the start point to the finish, adding
                // terms all the way. The end of the passage is always
                // SUM_CONTEXT beyond the last query-term.
                //
                Excerpt excerpt = new Excerpt();
                if (i != 0) {
                    excerpt.add(new Summary.Ellipsis());
                }

                //
                // Iterate through as long as we're before the end of
                // the document and we haven't hit the maximum number
                // of items in a summary.
                //
                while ((j < endToken) && (j - startToken < SUM_LENGTH)) {
                    //
                    // Now grab the hit-element, if present
                    //
                    Token t = tokens[j];
                    if (highlight.contains(t.termText())) {
                        excerpt.addToken(t.termText());
                        excerpt.add(new Summary.Fragment(text.substring(offset, t.startOffset())));
                        excerpt.add(new Summary.Highlight(text.substring(t.startOffset(), t.endOffset())));
                        offset = t.endOffset();
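                        // Extend the window so the passage always ends
                        // SUM_CONTEXT terms past the most recent hit.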
                        endToken = Math.min(j + SUM_CONTEXT, tokens.length);
                    }

                    j++;
                }

                lastExcerptPos = endToken;

                //
                // We found the series of search-term hits and added
                // them (with intervening text) to the excerpt. Now
                // we need to add the trailing edge of text.
                //
                // So if (j < tokens.length) then there is still trailing
                // text to add. (We haven't hit the end of the source doc.)
                // Add the words since the last hit-term insert.
                //
                if (j < tokens.length) {
                    excerpt.add(new Summary.Fragment(text.substring(offset, tokens[j].endOffset())));
                }

                //
                // Remember how many terms are in this excerpt
                //
                excerpt.setNumTerms(j - startToken);

                //
                // Store the excerpt for later sorting
                //
                excerptSet.add(excerpt);

                //
                // Skip ahead: the next search for an excerpt resumes
                // SUM_CONTEXT terms beyond the end of this one, since the
                // window above already extended that far past the last hit.
                //
                i = j + SUM_CONTEXT;
            }
        }

        //
        // If the target text doesn't appear, then we just
        // excerpt the first SUM_LENGTH words from the document.
        //
        if (excerptSet.isEmpty()) {
            Excerpt excerpt = new Excerpt();
            int excerptLen = Math.min(SUM_LENGTH, tokens.length);
            lastExcerptPos = excerptLen;

            excerpt.add(new Summary.Fragment(text.substring(
                    tokens[0].startOffset(), tokens[excerptLen - 1].startOffset())));
            excerpt.setNumTerms(excerptLen);
            excerptSet.add(excerpt);
        }

        //
        // Now choose the best items from the excerpt set.
        // Stop when our Summary grows too large.
        //
        double tokenCount = 0;
        Summary s = new Summary();
        while (tokenCount <= SUM_LENGTH && !excerptSet.isEmpty()) {
            Excerpt excerpt = excerptSet.last();
            excerptSet.remove(excerpt);

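            // Charge each fragment an equal share of the excerpt's term
            // count, so the SUM_LENGTH budget is spent fragment by fragment.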
            double tokenFraction = (1.0 * excerpt.getNumTerms()) / excerpt.numFragments();
            for (Summary.Fragment f : excerpt.elements()) {
                // Don't add fragments if it takes us over the max-limit
                if (tokenCount + tokenFraction <= SUM_LENGTH) {
                    s.add(f);
                }
                tokenCount += tokenFraction;
            }
        }

        if (tokenCount > 0 && lastExcerptPos < tokens.length) {
            s.add(new Summary.Ellipsis());
        }
        return s;
    }

    private Token[] getTokens(String text) throws IOException {
        ArrayList<Token> result = new ArrayList<Token>();
        TokenStream ts = analyzer.tokenStream("full", new StringReader(text));
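        // next() returns null once the stream is exhausted.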
        for (Token token = ts.next(); token != null; token = ts.next()) {
            result.add(token);
        }
        return result.toArray(new Token[result.size()]);
    }

    /**
     * Get the terms from a query and add them to the set of terms
     * to highlight.
     *
     * @param query the query to extract terms from
     */
    private void getTerms(Query query) {
        if (query instanceof BooleanQuery) {
            getBooleans((BooleanQuery) query);
        } else if (query instanceof PhraseQuery) {
            getPhrases((PhraseQuery) query);
        } else if (query instanceof WildcardQuery) {
            getWildTerm((WildcardQuery) query);
        } else if (query instanceof TermQuery) {
            getTerm((TermQuery) query);
        } else if (query instanceof PrefixQuery) {
            getPrefix((PrefixQuery) query);
        }
    }

    private void getBooleans(BooleanQuery query) {
        BooleanClause[] queryClauses = query.getClauses();
        for (int i = 0; i < queryClauses.length; i++) {
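            // Skip prohibited (NOT) clauses; their terms must not be highlighted.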
            if (!queryClauses[i].isProhibited()) {
                getTerms(queryClauses[i].getQuery());
            }
        }
    }

    private void getPhrases(PhraseQuery query) {
        Term[] queryTerms = query.getTerms();
        for (int i = 0; i < queryTerms.length; i++) {
            highlight.add(queryTerms[i].text());
        }
    }

    private void getTerm(TermQuery query) {
        highlight.add(query.getTerm().text());
    }

    private void getWildTerm(WildcardQuery query) {
        highlight.add(query.getTerm().text());
    }

    private void getPrefix(PrefixQuery query) {
        highlight.add(query.getPrefix().text());
    }

    /**
     * Tests summary generation. The user supplies the name of a
     * text file and a query string.
     */
    @SuppressWarnings("PMD.SystemPrintln")
    public static void main(String[] argv) throws IOException, ParseException {
        // Check the argument list
        if (argv.length < 2) {
            System.out.println("Usage: java org.opensolaris.opengrok.search.Summarizer <textfile> <queryStr>");
            return;
        }

        //
        // Parse the args: the text file, then the query terms.
        //
        File textFile = new File(argv[0]);
        StringBuffer queryBuf = new StringBuffer();
        for (int i = 1; i < argv.length; i++) {
            queryBuf.append(argv[i]);
            queryBuf.append(' ');
        }

        //
        // Convert the query string into a proper Query.
        //
        Analyzer a = new org.apache.lucene.analysis.standard.StandardAnalyzer();
        org.apache.lucene.queryParser.QueryParser qparser =
                new org.apache.lucene.queryParser.QueryParser("full", a);
        Query q = qparser.parse(queryBuf.toString().trim());
        Summarizer s = new Summarizer(q, a);

        //
        // Load the text file into a single string.
        //
        StringBuffer body = new StringBuffer();
        BufferedReader in = new BufferedReader(new FileReader(textFile));
        try {
            System.out.println("About to read " + textFile);
            String str = in.readLine();
            while (str != null) {
                body.append(str);
                // readLine() strips the line terminator; put a separator
                // back so words on adjacent lines don't run together.
                body.append('\n');
                str = in.readLine();
            }
        } finally {
            in.close();
        }

        //
        // Generate the summary and time how long it takes.
        //
        Date d1 = new Date();
        System.out.println("Summary: '" + s.getSummary(body.toString()) + "'");
        System.out.println((new Date()).getTime() - d1.getTime() + " msecs ");
    }
}