/**
 * Registers the token attributes this class reads on the given stream.
 *
 * <p>Adds, in this order: offset, reading, part-of-speech, inflection and
 * base-form attributes.
 *
 * @param tokenStream the stream to register the attributes on
 */
private void addAttributes(TokenStream tokenStream) {
    // Registration order is kept exactly as before.
    tokenStream.addAttribute(OffsetAttribute.class);
    tokenStream.addAttribute(ReadingAttribute.class);
    tokenStream.addAttribute(PartOfSpeechAttribute.class);
    tokenStream.addAttribute(InflectionAttribute.class);
    tokenStream.addAttribute(BaseFormAttribute.class);
}
/**
 * Copies the current base form from the stream's {@link BaseFormAttribute}
 * onto the given token.
 *
 * <p>Does nothing when the attribute lookup yields {@code null}.
 *
 * @param tokenStream the stream whose base-form attribute is read
 * @param token       the token that receives the base form
 */
private void readBaseForm(TokenStream tokenStream, LuceneToken token) {
    BaseFormAttribute attribute = tokenStream.getAttribute(BaseFormAttribute.class);
    if (attribute == null) {
        return;
    }
    token.setBaseForm(attribute.getBaseForm());
}
@Override public List<String> segmentWords(String text) { List<String> ret = new ArrayList<String>(); StringReader textreader = new StringReader(text); JapaneseTokenizer segmenter = new JapaneseTokenizer(textreader, null, true, JapaneseTokenizer.Mode.SEARCH); JaStemmer.lemma.clear(); CharTermAttribute termAtt = segmenter.getAttribute(CharTermAttribute.class); BaseFormAttribute baseAtt = segmenter.getAttribute(BaseFormAttribute.class); try { segmenter.reset(); while (segmenter.incrementToken()){ //segmenter.clearAttributes(); ret.add(termAtt.toString()); if(baseAtt.getBaseForm()!=null) JaStemmer.lemma.put(termAtt.toString(), baseAtt.getBaseForm()); } segmenter.close(); } catch (IOException e) { // TODO Auto-generated catch block. e.printStackTrace(); } return ret; }