@Override
public TokenStream create(TokenStream tokenStream) {
    // Wrap the incoming stream with Krovetz (KStem) stemming.
    final KStemFilter stemmed = new KStemFilter(tokenStream);
    return stemmed;
}
@Override
public TokenFilter create(TokenStream input) {
    // Produce a Krovetz (KStem) stemming filter over the given stream.
    final KStemFilter filter = new KStemFilter(input);
    return filter;
}
@Override
public TokenFilter wrapper(TokenStream input) {
    // Decorate the stream with Krovetz (KStem) stemming.
    final KStemFilter wrapped = new KStemFilter(input);
    return wrapped;
}
public static List<String> getNgrams(String text, int N) throws IOException { List<String> tokens = new ArrayList<String>(); Reader reader = new StringReader(text); // Tokenizer //StandardTokenizer tokenizer = new StandardTokenizer(Version.LUCENE_46, reader); LowerCaseTokenizer tokenizer = new LowerCaseTokenizer(Version.LUCENE_46, reader); // Filters LowerCaseFilter lowerCaseFilter = new LowerCaseFilter(Version.LUCENE_46, tokenizer); KStemFilter kStemFilter = new KStemFilter(lowerCaseFilter); CharArraySet stopwords = StopAnalyzer.ENGLISH_STOP_WORDS_SET; StopFilter stopFilter = new StopFilter(Version.LUCENE_46, kStemFilter, stopwords); TokenStream ts; if(N > 1) { PositionFilter positionFilter = new PositionFilter(stopFilter); //@SuppressWarnings("resource") //ShingleFilter shingleFilter = new ShingleFilter(positionFilter, N, N); //shingleFilter.setOutputUnigrams(false); @SuppressWarnings("resource") ShingleFilter shingleFilter = new ShingleFilter(positionFilter, 2, N); shingleFilter.setOutputUnigrams(true); ts = shingleFilter; } else { ts = stopFilter; } CharTermAttribute charTermAtt = ts.addAttribute(CharTermAttribute.class); ts.reset(); while (ts.incrementToken()) { String token = charTermAtt.toString(); if(token.length()>1) tokens.add(token); } ts.end(); ts.close(); return tokens; }