/** * * @param fieldName * @param reader * @return * @throws IOException */ @Override public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException { SavedStreams streams = (SavedStreams) getPreviousTokenStream(); if (streams == null) { streams = new SavedStreams(); streams.source = new LowerCaseTokenizer(reader); streams.result = new StopFilter(true, streams.source, stopWords, true); // streams.result = new PorterStemFilter(streams.source); setPreviousTokenStream(streams); } else { streams.source.reset(reader); } return streams.result; }
@Override public final TokenStream tokenStream(String fieldName, Reader reader) { // Apply stop words and porter stemmer using a lower-case tokenizer. TokenStream stream = new StopFilter(new LowerCaseTokenizer(reader), StandardAnalyzer.STOP_WORDS); return new PorterStemFilter(stream); }
/** * Takes a gloss-like string (text) and returns it tokenized. * with: * - stopwords * - lower case * - porter stemmer */ protected Set<String> tokenizeGloss( String s ) throws IOException { Set<String> result = new HashSet<String>(); // I am affraid that I am reimplementing the StandardAnalizer... TokenStream ts = new PorterStemFilter( new StopFilter( true, new LowerCaseTokenizer( new StringReader( s ) ), stopWords, true )); TermAttribute termAtt = ts.addAttribute(TermAttribute.class); while ( ts.incrementToken() ) { result.add( termAtt.term() ); } return result; }
/**
 * Builds the per-field analysis chain: lower-case tokenization followed
 * by stop-word removal (with position increments enabled).
 *
 * @param fieldName the field being analyzed (unused; all fields get the same chain)
 * @param reader    the character source to tokenize
 * @return a lower-cased, stop-word-filtered token stream
 */
public TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream lowered = new LowerCaseTokenizer(reader);
    return new StopFilter(true, lowered, stopWords);
}