@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
    // Holder for the per-thread tokenizer/filter chain so it can be reused across calls.
    class SavedStreams {
        StandardTokenizer tokenStream;
        TokenStream filteredTokenStream;
    }

    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
    if (streams == null) {
        // First use on this thread: build the chain and cache it.
        streams = new SavedStreams();
        setPreviousTokenStream(streams);
        streams.tokenStream = new StandardTokenizer(LUCENE_VERSION, reader);
        streams.filteredTokenStream = new StandardFilter(streams.tokenStream);
        streams.filteredTokenStream = new LowerCaseFilter(streams.filteredTokenStream);
        streams.filteredTokenStream = new StopFilter(true, streams.filteredTokenStream, STOP_WORDS_SET);
        streams.filteredTokenStream = new ASCIIFoldingFilter(streams.filteredTokenStream);
    } else {
        // Subsequent use: just point the cached tokenizer at the new reader.
        streams.tokenStream.reset(reader);
    }
    streams.tokenStream.setMaxTokenLength(DEFAULT_MAX_TOKEN_LENGTH);
    return streams.filteredTokenStream;
}
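A minimal consumption sketch for the analyzer above, assuming it lives in a Lucene 3.x Analyzer subclass here called MyAnalyzer (a hypothetical name) with the usual org.apache.lucene.analysis imports in scope:

// Hypothetical caller; MyAnalyzer stands in for the class containing reusableTokenStream above.
Analyzer analyzer = new MyAnalyzer();
TokenStream ts = analyzer.reusableTokenStream("body", new StringReader("Héllo, Wörld!"));
CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
ts.reset();
while (ts.incrementToken()) {
    // The chain above lower-cases and ASCII-folds, so this prints "hello" then "world".
    System.out.println(term.toString());
}
ts.end();
ts.close();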
public void testStandardTokenizer() throws Exception {
    // Mixed Korean / English / Chinese sample, roughly: "You, you will come back
    // to me again. school is a good place [roaring laughter]".
    // The original also assigned, then immediately overwrote, a Korean-only sample:
    // "우리나라라면에서부터 일본라면이 파생되었잖니?"
    // ("Japanese ramen was derived from our country's ramen, wasn't it?")
    String source = "너는 너는 다시 내게 돌아 올거야. school is a good place 呵呵大笑 呵呵大笑";

    long start = System.currentTimeMillis();
    StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
    TokenStream stream = analyzer.tokenStream("s", new StringReader(source));
    TokenStream tok = new StandardFilter(Version.LUCENE_36, stream);

    // Fetch the attributes once, outside the loop; a TokenFilter shares its input's
    // attribute source, so these views stay valid for every token.
    CharTermAttribute termAttr = tok.getAttribute(CharTermAttribute.class);
    OffsetAttribute offAttr = tok.getAttribute(OffsetAttribute.class);                       // unused below
    PositionIncrementAttribute posAttr = tok.getAttribute(PositionIncrementAttribute.class); // unused below
    TypeAttribute typeAttr = tok.getAttribute(TypeAttribute.class);                          // unused below

    tok.reset();
    while (tok.incrementToken()) {
        System.out.println(new String(termAttr.buffer(), 0, termAttr.length()));
    }
    tok.end();
    tok.close();

    System.out.println((System.currentTimeMillis() - start) + "ms");
}
public List<String> tokenize(Analyzer analyzer, String data) {
    List<String> terms = Lists.newArrayList();
    try {
        TokenStream tokens = new StandardFilter(analyzer.tokenStream(null, new StringReader(data)));
        tokens.reset();
        CharTermAttribute termAttribute = tokens.getAttribute(CharTermAttribute.class);
        while (tokens.incrementToken()) {
            String term = trimToNull(termAttribute.toString());
            if (term != null) {
                terms.add(term);
            }
        }
        tokens.end();
        tokens.close();
    } catch (IOException ioe) {
        LOG.warn("Unable to tokenize data. cause: {}", ioe.getMessage(), ioe);
    }
    return terms;
}
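A hedged usage sketch for the helper above, assuming Lucene 4.x+ (version-less StandardAnalyzer/StandardFilter constructors), Guava's Lists, and Commons Lang's trimToNull:

// Hypothetical caller for tokenize(...); the analyzer choice is an assumption.
public void tokenizeExample() {
    Analyzer analyzer = new StandardAnalyzer();
    List<String> terms = tokenize(analyzer, "The Quick Brown Fox");
    // StandardAnalyzer lower-cases, and depending on the Lucene version may also drop
    // English stop words, so terms is roughly [quick, brown, fox].
    System.out.println(terms);
}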
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    // IKTokenizer comes from the IK Analyzer project for Chinese text; the real
    // reader is supplied later via TokenStreamComponents#setReader.
    final Tokenizer src = new IKTokenizer(new StringReader(""), this.useSmart());
    TokenStream tok = new StandardFilter(src);
    return new TokenStreamComponents(src, tok);
}
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    final AutocompleteTokenizer tokenizer = new AutocompleteTokenizer(reader);
    TokenFilter filter = new StandardFilter(tokenizer);
    return new TokenStreamComponents(tokenizer, filter);
}
/**
 * @param fieldName ignored param
 * @param reader contains data to parse
 * @return TokenStream of ngrams
 */
public TokenStream tokenStream(String fieldName, Reader reader) {
    return new NGramTokenFilter(
            new LowerCaseFilter(
                    new StandardFilter(
                            new StandardTokenizer(reader))),
            min_ngram, max_ngram);
}
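A hedged sketch of what this chain emits, assuming the surrounding analyzer is named NGramAnalyzer (hypothetical) with min_ngram = 2 and max_ngram = 3; NGramTokenFilter slices each token into character n-grams after tokenization and lower-casing:

// Hypothetical driver; NGramAnalyzer stands in for the class defining tokenStream above.
TokenStream ts = new NGramAnalyzer().tokenStream("f", new StringReader("Fox"));
CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
ts.reset();
while (ts.incrementToken()) {
    System.out.println(term);  // "fo", "ox", "fox" (ordering varies by Lucene version)
}
ts.end();
ts.close();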
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    Version matchVersion = Version.LUCENE_45;
    final Tokenizer source = new WhitespaceTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(matchVersion, source);
    result = new DiarienummerTokenFilter(result);
    return new TokenStreamComponents(source, result);
}
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    Tokenizer source = new StandardTokenizer(LuceneConfig.USED_VERSION, reader);
    TokenStream result = new StandardFilter(LuceneConfig.USED_VERSION, source);
    return new TokenStreamComponents(source, result);
}
public TokenStream tokenStream(String fieldName, Reader reader) {
    // Overriding tokenStream(String, Reader) is the Lucene 3.x Analyzer API, so a
    // 3.x Version constant is used consistently here; the original mixed
    // Version.LUCENE_41 with StopFilter/LowerCaseFilter constructors that were
    // removed in Lucene 4.0.
    Version v = Version.LUCENE_36;
    return new SynonymFilter(
            new StopFilter(v,
                    new LowerCaseFilter(v,
                            new StandardFilter(v,
                                    new StandardTokenizer(v, reader))),
                    StopAnalyzer.ENGLISH_STOP_WORDS_SET),
            engine);
}
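The SynonymFilter/engine pair here is custom (it follows the Lucene in Action example). For comparison, a hedged sketch of similar wiring with Lucene 4.x's built-in org.apache.lucene.analysis.synonym.SynonymFilter, which takes a prebuilt SynonymMap instead of an engine; assume this runs inside a method that can throw IOException and that reader is the method's Reader parameter:

// Sketch only: the built-in Lucene synonym support, not the custom filter above.
SynonymMap.Builder builder = new SynonymMap.Builder(true);
builder.add(new CharsRef("quick"), new CharsRef("fast"), true);  // keep original term too
SynonymMap map = builder.build();

TokenStream ts = new StandardTokenizer(Version.LUCENE_41, reader);
ts = new StandardFilter(Version.LUCENE_41, ts);
ts = new LowerCaseFilter(Version.LUCENE_41, ts);
ts = new SynonymFilter(ts, map, true);  // true = ignore case when matching
ts = new StopFilter(Version.LUCENE_41, ts, StopAnalyzer.ENGLISH_STOP_WORDS_SET);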
@Override
public TokenStream create(TokenStream tokenStream) {
    return new StandardFilter(tokenStream);
}
@Override
public StandardFilter create(TokenStream input) {
    return new StandardFilter(input);
}
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    // Character-level n-grams of a single fixed size n.
    Tokenizer source = new NGramTokenizer(n, n);
    TokenStream result = new StandardFilter(source);
    return new TokenStreamComponents(source, result);
}
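A hedged sketch of the output for n = 2, assuming the surrounding analyzer subclass is called FixedNGramAnalyzer (hypothetical name); unlike the word-based chains above, NGramTokenizer slices the raw character stream:

// Hypothetical driver for the analyzer above with n = 2 (Lucene 5.x+ API assumed).
try (Analyzer a = new FixedNGramAnalyzer(); TokenStream ts = a.tokenStream("f", "fox")) {
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        System.out.println(term);  // prints "fo" then "ox"
    }
    ts.end();
}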
@Override
public StandardFilter create(TokenStream input) {
    return new StandardFilter(luceneMatchVersion, input);
}
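The create(...) overrides above are TokenFilterFactory implementations (the shape of Lucene/Solr's StandardFilterFactory). A hedged sketch of invoking such a factory by its registered name through CustomAnalyzer, assuming Lucene 5.x-7.x where CustomAnalyzer and the "standard" filter name exist, inside a method that can throw IOException:

// Sketch only: CustomAnalyzer resolves factory names via SPI.
Analyzer analyzer = CustomAnalyzer.builder()
        .withTokenizer("standard")      // StandardTokenizerFactory
        .addTokenFilter("standard")     // StandardFilterFactory -> a create(...) override above
        .addTokenFilter("lowercase")
        .build();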
/** Construct the rewriter. */
public StdTermFilter() {
    dribble = new DribbleStream();
    // Note: this chain lower-cases first, then applies StandardFilter.
    filter = new StandardFilter(new LowerCaseFilter(dribble));
}
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    final NGramWordTokenizer tokenizer = new NGramWordTokenizer(reader, 2, 3);
    TokenFilter filter = new StandardFilter(tokenizer);
    return new TokenStreamComponents(tokenizer, filter);
}
/**
 * Creates a {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 * which tokenizes all the text in the provided {@link Reader}.
 *
 * @param field ignored field name
 * @param reader source of tokens
 * @return A {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 *         built from a {@link StandardTokenizer} filtered with
 *         {@link StandardFilter} and {@link MorfologikFilter}.
 */
@Override
protected TokenStreamComponents createComponents(final String field, final Reader reader) {
    final Tokenizer src = new StandardTokenizer(getVersion(), reader);
    return new TokenStreamComponents(
            src,
            new MorfologikFilter(new StandardFilter(getVersion(), src), dictionary, getVersion()));
}
/**
 * Creates a {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 * which tokenizes all the text in the provided {@link Reader}.
 *
 * @param field ignored field name
 * @param reader source of tokens
 * @return A {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 *         built from a {@link StandardTokenizer} filtered with
 *         {@link StandardFilter} and {@link MorfologikFilter}.
 */
@Override
protected TokenStreamComponents createComponents(final String field, final Reader reader) {
    final Tokenizer src = new StandardTokenizer(this.version, reader);
    return new TokenStreamComponents(
            src,
            new MorfologikFilter(new StandardFilter(this.version, src), this.dictionary, this.version));
}
/**
 * Creates a {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 * which tokenizes all the text in the provided {@link Reader}.
 *
 * @param field ignored field name
 * @param reader source of tokens
 * @return A {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 *         built from a {@link StandardTokenizer} filtered with
 *         {@link StandardFilter} and {@link MorfologikFilter} (using the
 *         default dictionary).
 */
@Override
protected TokenStreamComponents createComponents(final String field, final Reader reader) {
    final Tokenizer src = new StandardTokenizer(this.version, reader);
    return new TokenStreamComponents(
            src,
            new MorfologikFilter(new StandardFilter(this.version, src), this.version));
}
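A hedged usage sketch for the Morfologik-based analyzers above, assuming a concrete subclass shaped like Lucene's MorfologikAnalyzer (from lucene-analyzers-morfologik, 4.x API) and Polish input; MorfologikFilter replaces inflected forms with dictionary lemmas:

// Sketch only; the version constant and expected lemma are assumptions.
Analyzer analyzer = new MorfologikAnalyzer(Version.LUCENE_47);
TokenStream ts = analyzer.tokenStream("body", new StringReader("książkami"));
CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
ts.reset();
while (ts.incrementToken()) {
    System.out.println(term);  // expected lemma: "książka" ("book")
}
ts.end();
ts.close();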
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    final NGramWordTokenizer tokenizer = new NGramWordTokenizer(reader, 1, 5, true);
    TokenFilter filter = new StandardFilter(tokenizer);
    return new TokenStreamComponents(tokenizer, filter);
}
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    final NGramWordTokenizer tokenizer = new NGramWordTokenizer(reader, 2, 2, true);
    TokenFilter filter = new StandardFilter(tokenizer);
    return new TokenStreamComponents(tokenizer, filter);
}
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    final NGramWordTokenizer tokenizer = new NGramWordTokenizer(reader, 3, 3, true);
    TokenFilter filter = new StandardFilter(tokenizer);
    return new TokenStreamComponents(tokenizer, filter);
}
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    final NGramWordTokenizer tokenizer = new NGramWordTokenizer(reader, 5, 5, true);
    TokenFilter filter = new StandardFilter(tokenizer);
    return new TokenStreamComponents(tokenizer, filter);
}
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    final NGramWordTokenizer tokenizer = new NGramWordTokenizer(reader, 4, 4, true);
    TokenFilter filter = new StandardFilter(tokenizer);
    return new TokenStreamComponents(tokenizer, filter);
}
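The fixed-size variants above differ only in the n-gram bounds passed to the custom NGramWordTokenizer. A hedged consolidation sketch, taking the (reader, min, max, boolean) constructor and the Lucene 4.x Analyzer API used above as given:

// Sketch: one configurable analyzer replacing the per-size copies above.
public final class SizedNGramWordAnalyzer extends Analyzer {
    private final int minGram;
    private final int maxGram;

    public SizedNGramWordAnalyzer(int minGram, int maxGram) {
        this.minGram = minGram;
        this.maxGram = maxGram;
    }

    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        final NGramWordTokenizer tokenizer = new NGramWordTokenizer(reader, minGram, maxGram, true);
        TokenFilter filter = new StandardFilter(tokenizer);
        return new TokenStreamComponents(tokenizer, filter);
    }
}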