opengrok/analysis/HistoryAnalyzer.java

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * See LICENSE.txt included in this distribution for the specific
 * language governing permissions and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at LICENSE.txt.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
0N/A
/*
 * Copyright (c) 2006, 2012, Oracle and/or its affiliates. All rights reserved.
 */
0N/Apackage org.opensolaris.opengrok.analysis;
0N/A
0N/Aimport java.io.Reader;
0N/Aimport java.util.Set;
0N/Aimport org.apache.lucene.analysis.Analyzer;
0N/Aimport org.apache.lucene.analysis.StopFilter;
0N/Aimport org.apache.lucene.analysis.TokenStream;
0N/Aimport org.opensolaris.opengrok.analysis.plain.PlainFullTokenizer;
1318N/Aimport org.opensolaris.opengrok.search.SearchEngine;
0N/A
0N/Apublic final class HistoryAnalyzer extends Analyzer {
1185N/A    private final Set<Object> stopWords;
1190N/A
0N/A    /** An array containing some common English words that are not usually useful
0N/A    for searching. */
372N/A    private static final String[] ENGLISH_STOP_WORDS = {
0N/A        "a", "an", "and", "are", "as", "at", "be", "but", "by",
0N/A        "for", "if", "in", "into", "is", "it",
0N/A        "no", "not", "of", "on", "or", "s", "such",
0N/A        "t", "that", "the", "their", "then", "there", "these",
0N/A        "they", "this", "to", "was", "will", "with",
0N/A        "/", "\\", ":",".","0.0", "1.0"
0N/A    };
1190N/A
0N/A    /** Builds an analyzer which removes words in ENGLISH_STOP_WORDS. */
0N/A    public HistoryAnalyzer() {
1318N/A        stopWords = StopFilter.makeStopSet(SearchEngine.LUCENE_VERSION,ENGLISH_STOP_WORDS);
0N/A    }
1190N/A
0N/A    /** Builds an analyzer which removes words in the provided array. */
0N/A    public HistoryAnalyzer(String[] stopWords) {
1318N/A        this.stopWords = StopFilter.makeStopSet(SearchEngine.LUCENE_VERSION,stopWords);
0N/A    }
1190N/A
0N/A    /** Filters LowerCaseTokenizer with StopFilter. */
928N/A    @Override
1380N/A    public final TokenStream tokenStream(String fieldName, Reader reader) {
928N/A        //we are counting position increments, this might affect the queries later and need to be in sync, especially for highlighting of results
1318N/A        return new StopFilter(SearchEngine.LUCENE_VERSION,new PlainFullTokenizer(reader), stopWords);
0N/A    }
1380N/A
1380N/A    @Override
1380N/A    public final TokenStream reusableTokenStream(String fieldName, Reader reader) {
1380N/A        //TODO needs refactoring to get more speed and less ram usage for indexer
1380N/A        return this.tokenStream(fieldName, reader);
1380N/A    }
0N/A}