/*
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// modified by Lubos Kosco 2010 to upgrade lucene to 3.0.0
// TODO : rewrite this to use Highlighter from lucene contrib ...
/** Implements hit summarization. */
public class Summarizer {
/** The number of context terms to display preceding and following matches.*/
/** The total number of terms to display in a summary.*/
/** Converts text to tokens. */
analyzer = a;
}
/**
 * Class Excerpt represents a single passage found in the
 * document, with the appropriate regions highlighted.
 */
static class Excerpt {
/**
*/
}
/**
 * Returns how many unique tokens we have.
 */
// NOTE(review): the body of this method appears truncated in this copy of the
// file — an int-returning method with an empty body will not compile. Restore
// the original body (presumably the size of a token set held by Excerpt — TODO
// confirm against the upstream source).
public int numUniqueTokens() {
}
/**
* How many fragments we have.
*/
// NOTE(review): the body of this method appears truncated in this copy of the
// file — an int-returning method with an empty body will not compile. Restore
// the original body (presumably the size of the passages/fragments list — TODO
// confirm against the upstream source).
public int numFragments() {
}
}
/**
 * Returns the number of query terms in this excerpt.
 * NOTE(review): {@code numTerms} is a field declared outside the visible
 * portion of this file — presumably a counter maintained while query-term
 * hits are added to the excerpt; confirm against the full source.
 *
 * @return the stored term count for this excerpt
 */
public int getNumTerms() {
return numTerms;
}
/**
* Add a frag to the list.
*/
}
/**
* Return an Enum for all the fragments
*/
return passages;
}
}
/** Returns a summary for the given pre-tokenized text. */
return null;
}
// Simplistic implementation. Finds the first fragments in the document
// containing any query terms.
//
// @TODO: check that phrases in the query are matched in the fragment
return new Summary();
}
//
// Create a SortedSet that ranks excerpts according to
// how many query terms are present. An excerpt is
// a List full of Fragments and Highlights
//
return 1;
} else {
return -1;
} else {
return 1;
}
}
}
});
//
// Iterate through all terms in the document
//
int lastExcerptPos = 0;
//
// If we find a term that's in the query...
//
//
// Start searching at a point SUM_CONTEXT terms back,
// and move SUM_CONTEXT terms into the future.
//
int j = startToken;
//
// Iterate from the start point to the finish, adding
// terms all the way. The end of the passage is always
// SUM_CONTEXT beyond the last query-term.
//
if (i != 0) {
}
//
// Iterate through as long as we're before the end of
// the document and we haven't hit the max-number-of-items
// -in-a-summary.
//
//
// Now grab the hit-element, if present
//
}
j++;
}
//
// We found the series of search-term hits and added
// them (with intervening text) to the excerpt. Now
// we need to add the trailing edge of text.
//
// So if (j < tokens.length) then there is still trailing
// text to add. (We haven't hit the end of the source doc.)
// Add the words since the last hit-term insert.
//
}
//
// Remember how many terms are in this excerpt
//
//
// Store the excerpt for later sorting
//
//
// Start SUM_CONTEXT places away. The next
// search for relevant excerpts begins at i-SUM_CONTEXT
//
i = j+SUM_CONTEXT;
}
}
//
// If the target text doesn't appear, then we just
// excerpt the first SUM_LENGTH words from the document.
//
excerpt.add(new Summary.Fragment(text.substring(tokens[0].startOffset(), tokens[excerptLen-1].startOffset())));
}
//
// Now choose the best items from the excerpt set.
// Stop when our Summary grows too large.
//
double tokenCount = 0;
// Don't add fragments if it takes us over the max-limit
s.add(f);
}
}
}
}
return s;
}
//FIXME: integrate the loop below into getSummary to avoid the cloning and extra memory;
//also, creating Tokens is suboptimal with Lucene 3.0.0 — this whole class could be replaced by Highlighter.
while(ts.incrementToken()) {
}
}
/**
 * Gets the terms from a query and adds them to the set of terms
 * to highlight in a stream of tokens.
 *
 * @param query the query to extract highlight terms from
 */
if (query instanceof BooleanQuery) {
} else if (query instanceof PhraseQuery) {
} else if (query instanceof WildcardQuery) {
} else if (query instanceof PrefixQuery) {
}
}
if (!queryClauses[i].isProhibited()) {
}
}
}
}
}
}
}
}
}