opengrok/search/Summarizer.java

	Summarizer.java revision 1318
595N/A/*
0N/A * Copyright 2005 The Apache Software Foundation
0N/A *
0N/A * Licensed under the Apache License, Version 2.0 (the "License");
0N/A * you may not use this file except in compliance with the License.
0N/A * You may obtain a copy of the License at
0N/A *
0N/A *     http://www.apache.org/licenses/LICENSE-2.0
0N/A *
0N/A * Unless required by applicable law or agreed to in writing, software
0N/A * distributed under the License is distributed on an "AS IS" BASIS,
0N/A * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
0N/A * See the License for the specific language governing permissions and
0N/A * limitations under the License.
0N/A */
816N/A
928N/A// modified by Lubos Kosco 2010 to upgrade lucene to 3.0.0
928N/A
928N/A// TODO : rewrite this to use Highlighter from lucene contrib ...
816N/A
0N/Apackage org.opensolaris.opengrok.search;
0N/A
388N/Aimport java.io.IOException;
388N/Aimport java.io.StringReader;
388N/Aimport java.util.ArrayList;
388N/Aimport java.util.Comparator;
388N/Aimport java.util.HashSet;
421N/Aimport java.util.List;
457N/Aimport java.util.Set;
388N/Aimport java.util.SortedSet;
388N/Aimport java.util.TreeSet;
388N/Aimport org.apache.lucene.analysis.Analyzer;
388N/Aimport org.apache.lucene.analysis.Token;
388N/Aimport org.apache.lucene.analysis.TokenStream;
928N/Aimport org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
1318N/Aimport org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
388N/Aimport org.apache.lucene.index.Term;
388N/Aimport org.apache.lucene.search.BooleanClause;
388N/Aimport org.apache.lucene.search.BooleanQuery;
388N/Aimport org.apache.lucene.search.PhraseQuery;
388N/Aimport org.apache.lucene.search.PrefixQuery;
388N/Aimport org.apache.lucene.search.Query;
388N/Aimport org.apache.lucene.search.TermQuery;
388N/Aimport org.apache.lucene.search.WildcardQuery;
0N/A
0N/A
0N/A/** Implements hit summarization. */
0N/Apublic class Summarizer {
1190N/A
0N/A    /** The number of context terms to display preceding and following matches.*/
0N/A    private static final int SUM_CONTEXT = 10;
1190N/A
0N/A    /** The total number of terms to display in a summary.*/
0N/A    private static final int SUM_LENGTH = 20;
1190N/A
0N/A    /** Converts text to tokens. */
368N/A    private final Analyzer analyzer;
1190N/A
457N/A    private final Set<String> highlight = new HashSet<String>();            // put query terms in table
1190N/A
0N/A    public Summarizer(Query query, Analyzer a) {
368N/A        analyzer = a;
0N/A        getTerms(query);
0N/A    }
1190N/A
0N/A    /**
0N/A     * Class Excerpt represents a single passage found in the
0N/A     * document, with some appropriate regions highlit.
0N/A     */
344N/A    static class Excerpt {
457N/A        List<Summary.Fragment> passages = new ArrayList<Summary.Fragment>();
457N/A        Set<String> tokenSet = new TreeSet<String>();
0N/A        int numTerms = 0;
1190N/A
0N/A        /**
0N/A         */
0N/A        public void addToken(String token) {
0N/A            tokenSet.add(token);
0N/A        }
1190N/A
0N/A        /**
0N/A         * Return how many unique toks we have
0N/A         */
0N/A        public int numUniqueTokens() {
0N/A            return tokenSet.size();
0N/A        }
1190N/A
0N/A        /**
0N/A         * How many fragments we have.
0N/A         */
0N/A        public int numFragments() {
0N/A            return passages.size();
0N/A        }
1190N/A
0N/A        public void setNumTerms(int numTerms) {
0N/A            this.numTerms = numTerms;
0N/A        }
1190N/A
0N/A        public int getNumTerms() {
0N/A            return numTerms;
0N/A        }
1190N/A
0N/A        /**
0N/A         * Add a frag to the list.
0N/A         */
0N/A        public void add(Summary.Fragment fragment) {
0N/A            passages.add(fragment);
0N/A        }
1190N/A
0N/A        /**
0N/A         * Return an Enum for all the fragments
0N/A         */
421N/A        public List<Summary.Fragment> elements() {
421N/A            return passages;
0N/A        }
0N/A    }
1190N/A
0N/A    /** Returns a summary for the given pre-tokenized text. */
0N/A    public Summary getSummary(String text) throws IOException {
388N/A        if (text == null) {
388N/A            return null;
388N/A        }
0N/A        // Simplistic implementation.  Finds the first fragments in the document
0N/A        // containing any query terms.
0N/A        //
460N/A        // @TODO: check that phrases in the query are matched in the fragment
1190N/A
0N/A        Token[] tokens = getTokens(text);             // parse text to token array
1190N/A
388N/A        if (tokens.length == 0) {
0N/A            return new Summary();
388N/A        }
1190N/A
1190N/A
0N/A        //
0N/A        // Create a SortedSet that ranks excerpts according to
0N/A        // how many query terms are present.  An excerpt is
421N/A        // a List full of Fragments and Highlights
0N/A        //
14N/A        SortedSet<Excerpt> excerptSet = new TreeSet<Excerpt>(new Comparator<Excerpt>() {
928N/A            @Override
14N/A            public int compare(Excerpt excerpt1, Excerpt excerpt2) {
1250N/A                if (excerpt1 == null) {
1250N/A                    return excerpt2 == null ? 0 : -1;
1250N/A                } else if (excerpt2 == null) {
1250N/A                    return 1;
1250N/A                } else {
338N/A                    int numToks1 = excerpt1.numUniqueTokens();
338N/A                    int numToks2 = excerpt2.numUniqueTokens();
338N/A
338N/A                    if (numToks1 < numToks2) {
338N/A                        return -1;
338N/A                    } else if (numToks1 == numToks2) {
338N/A                        return excerpt1.numFragments() - excerpt2.numFragments();
338N/A                    } else {
338N/A                        return 1;
338N/A                    }
1190N/A                }
0N/A            }
1250N/A        });
1190N/A
0N/A        //
0N/A        // Iterate through all terms in the document
0N/A        //
0N/A        int lastExcerptPos = 0;
1190N/A        for (int i = 0; i < tokens.length; i++) {
0N/A            //
0N/A            // If we find a term that's in the query...
0N/A            //
1318N/A            if (highlight.contains(tokens[i].toString())) {
0N/A                //
0N/A                // Start searching at a point SUM_CONTEXT terms back,
0N/A                // and move SUM_CONTEXT terms into the future.
0N/A                //
0N/A                int startToken = (i > SUM_CONTEXT) ? i-SUM_CONTEXT : 0;
0N/A                int endToken = Math.min(i+SUM_CONTEXT, tokens.length);
0N/A                int offset = tokens[startToken].startOffset();
0N/A                int j = startToken;
1190N/A
0N/A                //
0N/A                // Iterate from the start point to the finish, adding
0N/A                // terms all the way.  The end of the passage is always
0N/A                // SUM_CONTEXT beyond the last query-term.
0N/A                //
0N/A                Excerpt excerpt = new Excerpt();
0N/A                if (i != 0) {
0N/A                    excerpt.add(new Summary.Ellipsis());
0N/A                }
1190N/A
0N/A                //
0N/A                // Iterate through as long as we're before the end of
0N/A                // the document and we haven't hit the max-number-of-items
0N/A                // -in-a-summary.
0N/A                //
0N/A                while ((j < endToken) && (j - startToken < SUM_LENGTH)) {
0N/A                    //
0N/A                    // Now grab the hit-element, if present
0N/A                    //
0N/A                    Token t = tokens[j];
1318N/A                    if (highlight.contains(t.toString())) {
1318N/A                        excerpt.addToken(t.toString());
0N/A                        excerpt.add(new Summary.Fragment(text.substring(offset, t.startOffset())));
0N/A                        excerpt.add(new Summary.Highlight(text.substring(t.startOffset(),t.endOffset())));
0N/A                        offset = t.endOffset();
0N/A                        endToken = Math.min(j+SUM_CONTEXT, tokens.length);
0N/A                    }
1190N/A
0N/A                    j++;
0N/A                }
1190N/A
0N/A                lastExcerptPos = endToken;
1190N/A
0N/A                //
0N/A                // We found the series of search-term hits and added
0N/A                // them (with intervening text) to the excerpt.  Now
0N/A                // we need to add the trailing edge of text.
0N/A                //
0N/A                // So if (j < tokens.length) then there is still trailing
0N/A                // text to add.  (We haven't hit the end of the source doc.)
0N/A                // Add the words since the last hit-term insert.
0N/A                //
0N/A                if (j < tokens.length) {
0N/A                    excerpt.add(new Summary.Fragment(text.substring(offset,tokens[j].endOffset())));
0N/A                }
1190N/A
0N/A                //
0N/A                // Remember how many terms are in this excerpt
0N/A                //
0N/A                excerpt.setNumTerms(j - startToken);
1190N/A
0N/A                //
0N/A                // Store the excerpt for later sorting
0N/A                //
0N/A                excerptSet.add(excerpt);
1190N/A
0N/A                //
0N/A                // Start SUM_CONTEXT places away.  The next
0N/A                // search for relevant excerpts begins at i-SUM_CONTEXT
0N/A                //
0N/A                i = j+SUM_CONTEXT;
0N/A            }
0N/A        }
1190N/A
0N/A        //
0N/A        // If the target text doesn't appear, then we just
0N/A        // excerpt the first SUM_LENGTH words from the document.
0N/A        //
0N/A        if (excerptSet.size() == 0) {
0N/A            Excerpt excerpt = new Excerpt();
0N/A            int excerptLen = Math.min(SUM_LENGTH, tokens.length);
0N/A            lastExcerptPos = excerptLen;
1190N/A
0N/A            excerpt.add(new Summary.Fragment(text.substring(tokens[0].startOffset(), tokens[excerptLen-1].startOffset())));
0N/A            excerpt.setNumTerms(excerptLen);
0N/A            excerptSet.add(excerpt);
0N/A        }
1190N/A
0N/A        //
0N/A        // Now choose the best items from the excerpt set.
0N/A        // Stop when our Summary grows too large.
0N/A        //
0N/A        double tokenCount = 0;
0N/A        Summary s = new Summary();
0N/A        while (tokenCount <= SUM_LENGTH && excerptSet.size() > 0) {
388N/A            Excerpt excerpt = excerptSet.last();
0N/A            excerptSet.remove(excerpt);
1190N/A
0N/A            double tokenFraction = (1.0 * excerpt.getNumTerms()) / excerpt.numFragments();
421N/A            for (Summary.Fragment f: excerpt.elements()) {
0N/A                // Don't add fragments if it takes us over the max-limit
0N/A                if (tokenCount + tokenFraction <= SUM_LENGTH) {
0N/A                    s.add(f);
0N/A                }
0N/A                tokenCount += tokenFraction;
0N/A            }
0N/A        }
1190N/A
388N/A        if (tokenCount > 0 && lastExcerptPos < tokens.length) {
0N/A            s.add(new Summary.Ellipsis());
388N/A        }
0N/A        return s;
0N/A    }
1190N/A
0N/A    private Token[] getTokens(String text) throws IOException {
938N/A        //FIXME somehow integrate below cycle to getSummary to save the cloning and memory,
928N/A        //also creating Tokens is suboptimal with 3.0.0 , this whole class could be replaced by highlighter
14N/A        ArrayList<Token> result = new ArrayList<Token>();
368N/A        TokenStream ts = analyzer.tokenStream("full", new StringReader(text));
1318N/A        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
1185N/A        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
928N/A        while(ts.incrementToken()) {
1318N/A            Token t=new Token(term.buffer(),0,term.length(),offset.startOffset(),offset.endOffset());
928N/A            result.add(t);
1190N/A        }
388N/A        return result.toArray(new Token[result.size()]);
0N/A    }
1190N/A
1190N/A
0N/A    /**
0N/A     * Get the terms from a query and adds them to hightlite
0N/A     * a stream of tokens
0N/A     *
0N/A     * @param query
0N/A     */
1190N/A
0N/A    private void getTerms(Query query) {
388N/A        if (query instanceof BooleanQuery) {
0N/A            getBooleans((BooleanQuery) query);
388N/A        } else if (query instanceof PhraseQuery) {
0N/A            getPhrases((PhraseQuery) query);
388N/A        } else if (query instanceof WildcardQuery) {
0N/A            getWildTerm((WildcardQuery) query);
388N/A        } else if (query instanceof TermQuery) {
0N/A            getTerm((TermQuery) query);
388N/A        } else if (query instanceof PrefixQuery) {
0N/A            getPrefix((PrefixQuery) query);
388N/A        }
0N/A    }
1190N/A
0N/A    private void getBooleans(BooleanQuery query) {
0N/A        BooleanClause[] queryClauses = query.getClauses();
0N/A        for (int i = 0; i < queryClauses.length; i++) {
99N/A            if (!queryClauses[i].isProhibited()) {
99N/A                getTerms(queryClauses[i].getQuery());
99N/A            }
0N/A        }
0N/A    }
1190N/A
0N/A    private void getPhrases(PhraseQuery query) {
1190N/A        Term[] queryTerms = query.getTerms();
0N/A        for (int i = 0; i < queryTerms.length; i++) {
0N/A            highlight.add(queryTerms[i].text());
0N/A        }
0N/A    }
1190N/A
0N/A    private void getTerm(TermQuery query) {
0N/A        highlight.add(query.getTerm().text());
0N/A    }
1190N/A
0N/A    private void getWildTerm(WildcardQuery query) {
0N/A        highlight.add(query.getTerm().text());
0N/A    }
0N/A    private void getPrefix(PrefixQuery query) {
0N/A        highlight.add(query.getPrefix().text());
0N/A    }
0N/A}