/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * See LICENSE.txt included in this distribution for the specific
 * language governing permissions and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at LICENSE.txt.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2006, 2012, Oracle and/or its affiliates. All rights reserved.
 */
package org.opensolaris.opengrok.analysis;

import java.io.Reader;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.opensolaris.opengrok.analysis.plain.PlainFullTokenizer;
import org.opensolaris.opengrok.search.SearchEngine;

/**
 * Analyzer that tokenizes its input with {@link PlainFullTokenizer} and
 * drops every token contained in a configurable stop set.
 */
public final class HistoryAnalyzer extends Analyzer {

    /**
     * Stop set handed to every {@link StopFilter} created by this analyzer.
     * Declared with a wildcard instead of a raw type; the set is only ever
     * passed on to {@code StopFilter}, never modified here.
     */
    private final Set<?> stopWords;

    /**
     * Default stop list: common English words that are not usually useful
     * for searching, plus a few punctuation and version-like tokens
     * ({@code "/"}, {@code "\"}, {@code ":"}, {@code "."}, {@code "0.0"},
     * {@code "1.0"}).
     */
    private static final String[] ENGLISH_STOP_WORDS = {
        "a", "an", "and", "are", "as", "at", "be", "but", "by",
        "for", "if", "in", "into", "is", "it",
        "no", "not", "of", "on", "or", "s", "such",
        "t", "that", "the", "their", "then", "there", "these",
        "they", "this", "to", "was", "will", "with",
        "/", "\\", ":", ".", "0.0", "1.0"
    };

    /**
     * Builds an analyzer which removes the words in
     * {@link #ENGLISH_STOP_WORDS}.
     */
    public HistoryAnalyzer() {
        this(ENGLISH_STOP_WORDS);
    }

    /**
     * Builds an analyzer which removes the words in the provided array.
     *
     * @param stopWords the tokens to drop from the token stream
     */
    public HistoryAnalyzer(String[] stopWords) {
        this.stopWords = StopFilter.makeStopSet(SearchEngine.LUCENE_VERSION, stopWords);
    }

    /**
     * Creates a token stream that reads {@code reader} through a
     * {@link PlainFullTokenizer} and filters the result with a
     * {@link StopFilter} using this analyzer's stop set.
     *
     * @param fieldName name of the field being analyzed (not used here)
     * @param reader source of the text to tokenize
     * @return the stop-filtered token stream
     */
    @Override
    public final TokenStream tokenStream(String fieldName, Reader reader) {
        // We are counting position increments; this might affect the queries
        // later and needs to stay in sync, especially for highlighting of
        // results.
        return new StopFilter(SearchEngine.LUCENE_VERSION,
                new PlainFullTokenizer(reader), stopWords);
    }

    /**
     * Returns a token stream for {@code reader}. At present nothing is
     * reused: every call delegates to {@link #tokenStream(String, Reader)}
     * and therefore builds a new tokenizer and filter.
     *
     * @param fieldName name of the field being analyzed
     * @param reader source of the text to tokenize
     * @return a newly created stop-filtered token stream
     */
    @Override
    public final TokenStream reusableTokenStream(String fieldName, Reader reader) {
        // TODO needs refactoring to get more speed and less RAM usage for the indexer
        return this.tokenStream(fieldName, reader);
    }
}