Summarizer.java revision 368
/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/** Implements hit summarization. */
public class Summarizer {
/**
* The number of context terms to display preceding and following matches.
* The excerpt scan starts this many terms before a query-term hit, ends a
* passage this many terms beyond the last hit, and resumes scanning this
* many terms past the end of the previous excerpt.
*/
private static final int SUM_CONTEXT = 10;
/**
* The total number of terms to display in a summary.
* When no query term appears in the text, the first SUM_LENGTH words of the
* document are excerpted instead.
*/
private static final int SUM_LENGTH = 20;
/** Converts text to tokens. */
analyzer = a;
}
/**
* Class Excerpt represents a single passage found in the
* document, with some appropriate regions highlighted.
*/
static class Excerpt {
/** Term count recorded for this excerpt; initialized to zero. */
int numTerms = 0;
/**
* Creates an excerpt whose term count starts at zero.
*/
public Excerpt() {
}
/**
*/
}
/**
* Reports how many unique tokens we have.
* NOTE(review): the body is empty as seen here, so no value is returned;
* confirm the intended return expression against the complete file.
*/
public int numUniqueTokens() {
}
/**
* Reports how many fragments we have.
* NOTE(review): the body is empty as seen here, so no value is returned;
* confirm the intended return expression against the complete file.
*/
public int numFragments() {
}
/**
* Stores the number of terms counted for this excerpt.
*
* @param numTerms the term count to remember
*/
public void setNumTerms(int numTerms) {
  // The parameter shadows the field of the same name, so the field has to
  // be addressed through this; a bare assignment would be a no-op.
  this.numTerms = numTerms;
}
/**
* Returns the stored term count.
*
* @return the value of the numTerms field
*/
public int getNumTerms() {
  return numTerms;
}
/**
* Adds a fragment to the list.
*/
}
/**
* Intended to return an Enumeration over all the fragments.
* NOTE(review): the body is empty as seen here, so no value is returned;
* confirm the intended return expression against the complete file.
*/
public Enumeration elements() {
}
}
/** Returns a summary for the given pre-tokenized text. */
// Simplistic implementation. Finds the first fragments in the document
// containing any query terms.
//
// TODO: check that phrases in the query are matched in the fragment
return new Summary();
//
// Create a SortedSet that ranks excerpts according to
// how many query terms are present. An excerpt is
// a Vector full of Fragments and Highlights
//
return -1;
} else {
return 1;
}
return -1;
return 1;
} else
return 0;
}
}
);
//
// Iterate through all terms in the document
//
int lastExcerptPos = 0;
//
// If we find a term that's in the query...
//
//
// Start searching at a point SUM_CONTEXT terms back,
// and move SUM_CONTEXT terms into the future.
//
int j = startToken;
//
// Iterate from the start point to the finish, adding
// terms all the way. The end of the passage is always
// SUM_CONTEXT beyond the last query-term.
//
if (i != 0) {
}
//
// Iterate through as long as we're before the end of
// the document and we haven't hit the max-number-of-items
// -in-a-summary.
//
//
// Now grab the hit-element, if present
//
}
j++;
}
//
// We found the series of search-term hits and added
// them (with intervening text) to the excerpt. Now
// we need to add the trailing edge of text.
//
// So if (j < tokens.length) then there is still trailing
// text to add. (We haven't hit the end of the source doc.)
// Add the words since the last hit-term insert.
//
}
//
// Remember how many terms are in this excerpt
//
//
// Store the excerpt for later sorting
//
//
// Start SUM_CONTEXT places away. The next
// search for relevant excerpts begins at i-SUM_CONTEXT
//
i = j+SUM_CONTEXT;
}
}
//
// If the target text doesn't appear, then we just
// excerpt the first SUM_LENGTH words from the document.
//
excerpt.add(new Summary.Fragment(text.substring(tokens[0].startOffset(), tokens[excerptLen-1].startOffset())));
}
//
// Now choose the best items from the excerpt set.
// Stop when our Summary grows too large.
//
double tokenCount = 0;
// Don't add fragments if it takes us over the max-limit
s.add(f);
}
}
}
return s;
}
}
}
/**
* Gets the terms from a query and adds them in order to highlight
* a stream of tokens
*
* @param query
*/
if (query instanceof BooleanQuery)
else if (query instanceof PhraseQuery)
else if (query instanceof WildcardQuery)
else if (query instanceof PrefixQuery)
}
if (!queryClauses[i].isProhibited()) {
}
}
}
}
}
}
}
}
/**
* Tests Summary-generation. User inputs the name of a
* text file and a query string
*/
// Test arglist
return;
}
org.apache.lucene.queryParser.QueryParser qparser = new org.apache.lucene.queryParser.QueryParser("full", a);
Summarizer s = new Summarizer(q, a);
//
// Parse the args
//
}
//
// Load the text file into a single string.
//
try {
}
} finally {
}
// Convert the query string into a proper Query
}
}