Example source code for the Java class org.apache.lucene.analysis.StopFilter

Project: elasticsearch_my    File: StopTokenFilterTests.java
public void testCorrectPositionIncrementSetting() throws IOException {
    Builder builder = Settings.builder().put("index.analysis.filter.my_stop.type", "stop");
    if (random().nextBoolean()) {
        builder.put("index.analysis.filter.my_stop.version", Version.LATEST);
    } else {
        // don't specify
    }
    builder.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString());
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(builder.build());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_stop");
    assertThat(tokenFilter, instanceOf(StopTokenFilterFactory.class));
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("foo bar"));
    TokenStream create = tokenFilter.create(tokenizer);
    assertThat(create, instanceOf(StopFilter.class));
}
Project: lucene-bo    File: TibetanAnalyzer.java
/**
 * Creates a new {@link TibetanAnalyzer}
 * 
 * @param  segmentInWords  whether segmentation should be on words instead of syllables
 * @param  lemmatize  whether the analyzer should remove affixed particles and normalize words (in word mode)
 * @param  filterChars  whether the text should be converted to NFD (necessary for texts containing NFC strings)
 * @param  inputMethod  the input method, determining whether the text should be converted from EWTS to Unicode
 * @param  stopFilename  the name of a file containing the stop-word list
 * @throws IOException  if the file containing stopwords can't be opened
 */
public TibetanAnalyzer(boolean segmentInWords, boolean lemmatize, boolean filterChars, String inputMethod, String stopFilename) throws IOException {
    this.segmentInWords = segmentInWords;
    this.lemmatize = lemmatize;
    this.filterChars = filterChars;
    this.inputMethod = inputMethod;
    if (stopFilename != null) {
        if (stopFilename.isEmpty()) {
            InputStream stream = TibetanAnalyzer.class.getResourceAsStream("/bo-stopwords.txt");
            if (stream == null) {      // not running from the jar, so the bundled resource is unavailable; fall back to no stop set
                this.tibStopSet = null;
            } else {
                this.tibStopSet = StopFilter.makeStopSet(getWordList(stream, "#"));
            }
        } else {
            this.tibStopSet = StopFilter.makeStopSet(getWordList(new FileInputStream(stopFilename), "#"));
        }
    } else {
        this.tibStopSet = null;
    }
}
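A minimal usage sketch for the constructor above. The argument values are illustrative assumptions (word segmentation, lemmatization and NFD conversion enabled, "ewts" as the input method, and an empty stop-file name so that the bundled /bo-stopwords.txt resource is used); they are not taken from the project.

// Hedged usage sketch; "ewts" is an assumed inputMethod value.
TibetanAnalyzer analyzer = new TibetanAnalyzer(true, true, true, "ewts", "");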
Project: subsonic    File: SearchService.java
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
    class SavedStreams {
        StandardTokenizer tokenStream;
        TokenStream filteredTokenStream;
    }

    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
    if (streams == null) {
        streams = new SavedStreams();
        setPreviousTokenStream(streams);
        streams.tokenStream = new StandardTokenizer(LUCENE_VERSION, reader);
        streams.filteredTokenStream = new StandardFilter(streams.tokenStream);
        streams.filteredTokenStream = new LowerCaseFilter(streams.filteredTokenStream);
        streams.filteredTokenStream = new StopFilter(true, streams.filteredTokenStream, STOP_WORDS_SET);
        streams.filteredTokenStream = new ASCIIFoldingFilter(streams.filteredTokenStream);
    } else {
        streams.tokenStream.reset(reader);
    }
    streams.tokenStream.setMaxTokenLength(DEFAULT_MAX_TOKEN_LENGTH);

    return streams.filteredTokenStream;
}
Project: FutureSonic-Server    File: SearchService.java
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
    class SavedStreams {
        StandardTokenizer tokenStream;
        TokenStream filteredTokenStream;
    }

    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
    if (streams == null) {
        streams = new SavedStreams();
        setPreviousTokenStream(streams);
        streams.tokenStream = new StandardTokenizer(LUCENE_VERSION, reader);
        streams.filteredTokenStream = new StandardFilter(streams.tokenStream);
        streams.filteredTokenStream = new LowerCaseFilter(streams.filteredTokenStream);
        streams.filteredTokenStream = new StopFilter(true, streams.filteredTokenStream, STOP_WORDS_SET);
        streams.filteredTokenStream = new ASCIIFoldingFilter(streams.filteredTokenStream);
    } else {
        streams.tokenStream.reset(reader);
    }
    streams.tokenStream.setMaxTokenLength(DEFAULT_MAX_TOKEN_LENGTH);

    return streams.filteredTokenStream;
}
Project: madsonic-server-5.1    File: SearchService.java
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
    class SavedStreams {
        StandardTokenizer tokenStream;
        TokenStream filteredTokenStream;
    }

    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
    if (streams == null) {
        streams = new SavedStreams();
        setPreviousTokenStream(streams);
        streams.tokenStream = new StandardTokenizer(LUCENE_VERSION, reader);
        streams.filteredTokenStream = new StandardFilter(streams.tokenStream);
        streams.filteredTokenStream = new LowerCaseFilter(streams.filteredTokenStream);
        streams.filteredTokenStream = new StopFilter(true, streams.filteredTokenStream, STOP_WORDS_SET);
        streams.filteredTokenStream = new ASCIIFoldingFilter(streams.filteredTokenStream);
    } else {
        streams.tokenStream.reset(reader);
    }
    streams.tokenStream.setMaxTokenLength(DEFAULT_MAX_TOKEN_LENGTH);

    return streams.filteredTokenStream;
}
Project: madsonic-server-5.0    File: SearchService.java
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
    class SavedStreams {
        StandardTokenizer tokenStream;
        TokenStream filteredTokenStream;
    }

    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
    if (streams == null) {
        streams = new SavedStreams();
        setPreviousTokenStream(streams);
        streams.tokenStream = new StandardTokenizer(LUCENE_VERSION, reader);
        streams.filteredTokenStream = new StandardFilter(streams.tokenStream);
        streams.filteredTokenStream = new LowerCaseFilter(streams.filteredTokenStream);
        streams.filteredTokenStream = new StopFilter(true, streams.filteredTokenStream, STOP_WORDS_SET);
        streams.filteredTokenStream = new ASCIIFoldingFilter(streams.filteredTokenStream);
    } else {
        streams.tokenStream.reset(reader);
    }
    streams.tokenStream.setMaxTokenLength(DEFAULT_MAX_TOKEN_LENGTH);

    return streams.filteredTokenStream;
}
Project: dash-xtf    File: LuceneIndexToDict.java
/**
 * Read a Lucene index and make a spelling dictionary from it. A minimal token
 * analyzer will be used, which is usually just what is needed for the
 * dictionary. The default set of English stop words will be used (see
 * {@link StopAnalyzer#ENGLISH_STOP_WORDS}).
 * 
 * @param indexDir directory containing the Lucene index
 * @param dictDir directory to receive the spelling dictionary
 * @param prog tracker called periodically to display progress
 */
public static void createDict(Directory indexDir, 
                              File dictDir, 
                              ProgressTracker prog)
  throws IOException
{
  // Open and clear the dictionary (since we're going to totally rebuild it)
  SpellWriter spellWriter = SpellWriter.open(dictDir);
  spellWriter.clearDictionary();
  spellWriter.setStopwords(StopFilter.makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS));

  // Now re-tokenize all the fields and queue the words for the dictionary.
  IndexReader indexReader = IndexReader.open(indexDir);
  createDict(indexReader, new MinimalAnalyzer(), spellWriter, prog);

  // All done.
  spellWriter.close();
}
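A hedged usage sketch of createDict() above. The directory paths are placeholders, and prog stands for whatever ProgressTracker implementation the caller supplies; neither is taken from the project.

// Hedged usage sketch; the paths and the ProgressTracker instance are assumptions.
Directory indexDir = FSDirectory.open(new File("/path/to/lucene-index"));
File dictDir = new File("/path/to/spelling-dictionary");
LuceneIndexToDict.createDict(indexDir, dictDir, prog);   // prog: a caller-supplied ProgressTracker (assumed)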
Project: DrakkarKeel    File: StopStemAnalyzerCaseSensitive.java
/**
 *
 * @param fieldName
 * @param reader
 * @return
 * @throws IOException
 */
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws
        IOException {
    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
    if (streams == null) {
        streams = new SavedStreams();
        streams.source = new WhitespaceTokenizer(reader);
        streams.result = new StopFilter(true, streams.source, stopwords, true);
        // Stem on top of the stop-filtered stream; wrapping streams.source directly would discard the StopFilter.
        streams.result = new PorterStemFilter(streams.result);

        setPreviousTokenStream(streams);
    } else {
        streams.source.reset(reader);
    }
    return streams.result;
}
Project: DrakkarKeel    File: DefaultAnalyzer.java
/**
     *
     * @param fieldName
     * @param reader
     * @return
     * @throws IOException
     */
    @Override
    public TokenStream reusableTokenStream(String fieldName, Reader reader) throws
            IOException {
        SavedStreams streams = (SavedStreams) getPreviousTokenStream();
        if (streams == null) {
            streams = new SavedStreams();
            streams.source = new LowerCaseTokenizer(reader);
            streams.result = new StopFilter(true, streams.source, stopWords, true);
//            streams.result = new PorterStemFilter(streams.source);

            setPreviousTokenStream(streams);
        } else {
            streams.source.reset(reader);
        }
        return streams.result;
    }
Project: DrakkarKeel    File: DefaultCaseSensitiveAnalyzer.java
/**
     *
     * @param fieldName
     * @param reader
     * @return
     * @throws IOException
     */
    @Override
    public TokenStream reusableTokenStream(String fieldName, Reader reader) throws
            IOException {
        SavedStreams streams = (SavedStreams) getPreviousTokenStream();
        if (streams == null) {
            streams = new SavedStreams();
            streams.source = new WhitespaceTokenizer(reader);
            streams.result = new StopFilter(true, streams.source, stopWords, true);
//            streams.result = new PorterStemFilter(streams.source);

            setPreviousTokenStream(streams);
        } else {
            streams.source.reset(reader);
        }
        return streams.result;
    }
Project: ansj-seg-for-lucene3    File: AnsjAnalyzer.java
@Override
protected TokenStreamComponents createComponents(String fieldName,
        Reader reader) {
    Analysis in;
    try {
        in = analysis.getConstructor(Reader.class).newInstance(reader);
    } catch (Exception e) {
        throw new RuntimeException("Ansj analysis can't be instantiated!", e);
    }

    final Tokenizer source = new AnsjTokenizer(reader,in);

    TokenStreamComponents result;
    if (stopwords.isEmpty()) {
        result = new TokenStreamComponents(source);
    } else {
        result = new TokenStreamComponents(source,new StopFilter(matchVersion, source, stopwords));
    }

    return result;
}
Project: elasticsearch_my    File: StopTokenFilterFactory.java
@Override
public TokenStream create(TokenStream tokenStream) {
    if (removeTrailing) {
        return new StopFilter(tokenStream, stopWords);
    } else {
        return new SuggestStopFilter(tokenStream, stopWords);
    }
}
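The two branches above differ only in how a trailing stop word is handled: StopFilter removes it like any other stop word, while SuggestStopFilter (from the lucene-suggest module) is meant to keep a final stop word so that completion suggesters can still match it while the user is typing. A hedged sketch of the StopFilter branch, using the same Lucene API as the tests on this page; the input text and stop set are assumptions:

// Hedged sketch; "simon the" and the stop set are assumed inputs.
Tokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader("simon the"));
CharArraySet stopWords = StopFilter.makeStopSet("the");
TokenStream stream = new StopFilter(tokenizer, stopWords);   // removeTrailing == true: the trailing "the" is dropped
// new SuggestStopFilter(tokenizer, stopWords) would be the removeTrailing == false branch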
Project: elasticsearch_my    File: ShingleTokenFilterFactoryTests.java
public void testFillerToken() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("shingle_filler");
    String source = "simon the sorcerer";
    String[] expected = new String[]{"simon FILLER", "simon FILLER sorcerer", "FILLER sorcerer"};
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    TokenStream stream = new StopFilter(tokenizer, StopFilter.makeStopSet("the"));
    assertTokenStreamContents(tokenFilter.create(stream), expected);
}
Project: elasticsearch_my    File: JapaneseStopTokenFilterFactory.java
@Override
public TokenStream create(TokenStream tokenStream) {
    if (removeTrailing) {
        return new StopFilter(tokenStream, stopWords);
    } else {
        return new SuggestStopFilter(tokenStream, stopWords);
    }
}
Project: lucene-bo    File: TibetanAnalyzerTest.java
@Test
public void stopwordFilterTest() throws IOException
{
    System.out.println("Testing TibetanAnalyzer.tibStopWords");
    String input = "ཧ་ཏུ་གི་ཀྱི་གིས་ཀྱིས་ཡིས་ཀྱང་སྟེ་ཏེ་མམ་རམ་སམ་ཏམ་ནོ་བོ་ཏོ་གིན་ཀྱིན་གྱིན་ཅིང་ཅིག་ཅེས་ཞེས་ཧ།";
    Reader reader = new StringReader(input);
    List<String> expected = Arrays.asList("ཧ", "ཧ");

    System.out.print(input + " => ");
    TokenStream syllables = tokenize(reader, new TibSyllableTokenizer());
    CharArraySet stopSet = StopFilter.makeStopSet(TibetanAnalyzer.getWordList(new FileInputStream("src/main/resources/bo-stopwords.txt"), "#"));
    StopFilter res = new StopFilter(syllables, stopSet);
    assertTokenStream(res, expected);
}
Project: elasticsearch-analysis-benz    File: AnalysisBenzPlugin.java
@Override
public Map<String, AnalysisModule.AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
    return Collections.singletonMap("benz_cjk", (indexSettings, environment, name, settings) -> new AbstractTokenFilterFactory(indexSettings, name, settings) {

        @Override
        public TokenStream create(TokenStream tokenStream) {
            return new StopFilter(tokenStream, config.getStopWords());
        }
    });
}
Project: elasticsearch-analysis-ja    File: JapaneseStopTokenFilterFactory.java
@Override
public TokenStream create(final TokenStream tokenStream) {
    if (removeTrailing) {
        return new StopFilter(tokenStream, stopWords);
    } else {
        return new SuggestStopFilter(tokenStream, stopWords);
    }
}
Project: Openfire    File: WordMatchRouter.java
@Override
public final TokenStream tokenStream(String fieldName, Reader reader) {
    // Apply stop words and porter stemmer using a lower-case tokenizer.
    TokenStream stream = new StopFilter(new LowerCaseTokenizer(reader),
        StandardAnalyzer.STOP_WORDS);
    return new PorterStemFilter(stream);
}
Project: g3server    File: WordMatchRouter.java
@Override
public final TokenStream tokenStream(String fieldName, Reader reader) {
    // Apply stop words and porter stemmer using a lower-case tokenizer.
    TokenStream stream = new StopFilter(new LowerCaseTokenizer(reader),
        StandardAnalyzer.STOP_WORDS);
    return new PorterStemFilter(stream);
}
Project: openfire    File: WordMatchRouter.java
@Override
public final TokenStream tokenStream(String fieldName, Reader reader) {
    // Apply stop words and porter stemmer using a lower-case tokenizer.
    TokenStream stream = new StopFilter(new LowerCaseTokenizer(reader),
        StandardAnalyzer.STOP_WORDS);
    return new PorterStemFilter(stream);
}
Project: align-api-project    File: JWNLDistances.java
/**
 * Takes a gloss-like string (text) and returns it tokenized, with:
 * - stop-word removal
 * - lower-casing
 * - Porter stemming
 */
protected Set<String> tokenizeGloss( String s ) throws IOException {
    Set<String> result = new HashSet<String>();
    // I am afraid that I am reimplementing the StandardAnalyzer...
    TokenStream ts = new PorterStemFilter(
            new StopFilter( true,
                    new LowerCaseTokenizer( new StringReader( s ) ),
                    stopWords, true ));
    TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
    while ( ts.incrementToken() ) {
        result.add( termAtt.term() );
    }
    return result;
}
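A hedged illustration of what the method above produces, assuming the stopWords set contains common English words such as "the"; the input gloss and the exact stems are assumptions:

// Hedged example: stop-word removal, lower-casing and Porter stemming
// should reduce "The running cats" to roughly {"run", "cat"}.
Set<String> tokens = tokenizeGloss("The running cats");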
Project: ansj-seg-for-lucene3    File: AnsjIndexAnalyzer.java
@Override
protected TokenStreamComponents createComponents(String fieldName,
        Reader reader) {
    final Tokenizer source = new AnsjTokenizer(reader,new IndexAnalysis(reader));

    TokenStreamComponents result;
    if (stopwords.isEmpty()) {
        result = new TokenStreamComponents(source);
    } else {
        result = new TokenStreamComponents(source,new StopFilter(matchVersion, source, stopwords));
    }

    return result;
}
Project: ansj-seg-for-lucene3    File: AnsjSearchAnalyzer.java
@Override
protected TokenStreamComponents createComponents(String fieldName,
        Reader reader) {
    final Tokenizer source = new AnsjTokenizer(reader,new ToAnalysis(reader));

    TokenStreamComponents result;
    if (stopwords.isEmpty()) {
        result = new TokenStreamComponents(source);
    } else {
        result = new TokenStreamComponents(source,new StopFilter(matchVersion, source, stopwords));
    }

    return result;
}
Project: t4f-data    File: SynonymAnalyzer.java
public TokenStream tokenStream(String fieldName, Reader reader) {
  TokenStream result = new SynonymFilter(
                        new StopFilter(true,
                          new LowerCaseFilter(
                            new StandardFilter(
                              new StandardTokenizer(
                               Version.LUCENE_41, reader))),
                          StopAnalyzer.ENGLISH_STOP_WORDS_SET),
                        engine
                       );
  return result;
}
Project: openfire-bespoke    File: WordMatchRouter.java
@Override
public final TokenStream tokenStream(String fieldName, Reader reader) {
    // Apply stop words and porter stemmer using a lower-case tokenizer.
    TokenStream stream = new StopFilter(new LowerCaseTokenizer(reader),
        StandardAnalyzer.STOP_WORDS);
    return new PorterStemFilter(stream);
}
Project: elasticsearch_my    File: MyFilterTokenFilterFactory.java
@Override
public TokenStream create(TokenStream tokenStream) {
    return new StopFilter(tokenStream, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
}
Project: ExpertFinder    File: GermanAnalyzer.java
/**
 * Builds an analyzer with the given stop words.
 */
public GermanAnalyzer(String[] stopwords) {
  stopSet = StopFilter.makeStopSet(stopwords);
}
Project: ExpertFinder    File: GermanAnalyzer.java
/**
 * Builds an exclusionlist from an array of Strings.
 */
public void setStemExclusionTable(String[] exclusionlist) {
  exclusionSet = StopFilter.makeStopSet(exclusionlist);
}
Project: community-edition-old    File: AlfrescoStandardAnalyser.java
/** Builds an analyzer with the given stop words. */
public AlfrescoStandardAnalyser(String[] stopWords)
{
    stopSet = StopFilter.makeStopSet(stopWords);
}
Project: dash-xtf    File: SpellWritingAnalyzer.java
/**
 * Builds an analyzer which writes to the given spelling dictionary, using the
 * given stop words.
 */
public SpellWritingAnalyzer(String[] stopWords, SpellWriter spellWriter)
{
  this(StopFilter.makeStopSet(stopWords), spellWriter);
}
Project: CadalWorkspace    File: CJKAnalyzer.java
/**
 * Builds an analyzer which removes words in {@link #STOP_WORDS}.
 */
public CJKAnalyzer() {
  stopTable = StopFilter.makeStopSet(STOP_WORDS);
}
Project: t4f-data    File: StopAnalyzer2.java
public StopAnalyzer2(String[] stopWords) {
  this.stopWords = StopFilter.makeStopSet(stopWords);
}
Project: t4f-data    File: StopAnalyzer2.java
public TokenStream tokenStream(String fieldName, Reader reader) {
  return new StopFilter(true, new LowerCaseFilter(new LetterTokenizer(reader)),
      stopWords);
}
Project: t4f-data    File: StopAnalyzer1.java
public StopAnalyzer1(String[] stopWords) {
  this.stopWords = StopFilter.makeStopSet(stopWords);
}
Project: t4f-data    File: StopAnalyzer1.java
public TokenStream tokenStream(String fieldName, Reader reader) {
  return new StopFilter(true,
                        new LowerCaseTokenizer(reader), 
                        stopWords);
}
Project: t4f-data    File: StopAnalyzerFlawed.java
public StopAnalyzerFlawed(String[] stopWords) {
  this.stopWords = StopFilter.makeStopSet(stopWords);
}
Project: t4f-data    File: StopAnalyzerFlawed.java
/**
 * Ordering mistake here
 */
public TokenStream tokenStream(String fieldName, Reader reader) {
  return new LowerCaseFilter(
         new StopFilter(true, new LetterTokenizer(reader),
                        stopWords));
}
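The "ordering mistake" flagged in the Javadoc is that stop-word removal runs before lower-casing, so capitalized stop words such as "The" slip past the filter. A corrected chain, mirroring StopAnalyzer2 earlier on this page, lower-cases first:

// Corrected ordering sketch: lower-case before removing stop words,
// so "The" and "the" are both filtered out.
public TokenStream tokenStream(String fieldName, Reader reader) {
  return new StopFilter(true,
                        new LowerCaseFilter(new LetterTokenizer(reader)),
                        stopWords);
}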
Project: ExpertFinder    File: GermanAnalyzer.java
/**
 * Builds an analyzer with the default stop words
 * (<code>GERMAN_STOP_WORDS</code>).
 */
public GermanAnalyzer() {
  stopSet = StopFilter.makeStopSet(GERMAN_STOP_WORDS);
}
Project: CadalWorkspace    File: CJKAnalyzer.java
/**
 * Builds an analyzer which removes words in the provided array.
 *
 * @param stopWords stop word array
 */
public CJKAnalyzer(String[] stopWords) {
  stopTable = StopFilter.makeStopSet(stopWords);
}