/**
 * Tokenizes and stems the given text, producing one {@link Annotation} per token.
 *
 * <p>The text is first normalized with {@code SimpleTokenizer.format}, then run through an
 * {@link EnglishAnalyzer} (constructed with an empty {@link CharArraySet}) whose output is
 * wrapped in an {@link EnglishMinimalStemFilter}. For every emitted token the annotation's
 * form is the substring of the formatted text covered by the token's offsets, and its lemma
 * is the term text produced by the filter chain.
 *
 * @param text the raw input text
 * @return the annotations in token order; empty when the analysis chain yields no tokens
 * @throws Exception if formatting or token stream processing fails
 */
@Override
public List<Annotation> annotate(String text) throws Exception {
    text = SimpleTokenizer.format(text);
    List<Annotation> out = Lists.newArrayList();

    Analyzer analyser = new EnglishAnalyzer(Version.LUCENE_47, CharArraySet.EMPTY_SET);
    try {
        TokenFilter filter = new EnglishMinimalStemFilter(
                analyser.tokenStream("text", new StringReader(text)));
        try {
            // Attribute instances are shared for the lifetime of the stream, so look them
            // up once instead of on every token.
            CharTermAttribute termAttribute = filter.addAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAttribute = filter.addAttribute(OffsetAttribute.class);

            // The TokenStream contract requires reset() before the first incrementToken().
            filter.reset();
            while (filter.incrementToken()) {
                String token = text.substring(
                        offsetAttribute.startOffset(), offsetAttribute.endOffset());
                String lemma = termAttribute.toString();

                Annotation t = new Annotation();
                t.setForm(token);
                t.setLemma(lemma);
                out.add(t);
            }
            // Signals end-of-stream to the filter chain, as required by the contract.
            filter.end();
        } finally {
            // Always release the stream, even when tokenization fails part-way.
            filter.close();
        }
    } finally {
        analyser.close();
    }

    if (out.isEmpty()) {
        log.debug("Input string is empty");
    }
    return out;
}
/**
 * Wraps the supplied token stream in an {@link EnglishMinimalStemFilter}.
 *
 * @param input the upstream token stream to decorate
 * @return a new {@code EnglishMinimalStemFilter} that reads from {@code input}
 */
@Override
public TokenStream create(TokenStream input) {
    final TokenStream stemmed = new EnglishMinimalStemFilter(input);
    return stemmed;
}