@Test public void testNoPos() throws IOException { final Set<String> posTags = new HashSet<>(); Analyzer analyzer = new Analyzer() { @Override protected TokenStreamComponents createComponents(final String fieldName) { final Tokenizer tokenizer = new JapaneseTokenizer(null, false, JapaneseTokenizer.Mode.SEARCH); final PartOfSpeechAttribute posAtt = tokenizer.addAttribute(PartOfSpeechAttribute.class); return new TokenStreamComponents(tokenizer, new PosConcatenationFilter(tokenizer, posTags, new PosConcatenationFilter.PartOfSpeechSupplier() { @Override public String get() { return posAtt.getPartOfSpeech(); } })); } }; assertAnalyzesTo(analyzer, "明日は詳細設計です。", // new String[] { "明日", "は", "詳細", "設計", "です", "。" }, // new int[] { 0, 2, 3, 5, 7, 9 }, // new int[] { 2, 3, 5, 7, 9, 10 }, // new int[] { 1, 1, 1, 1, 1, 1 }); }
@Test public void testBasic() throws IOException { final Set<String> posTags = new HashSet<>(); posTags.add("名詞-副詞可能"); posTags.add("名詞-形容動詞語幹"); posTags.add("名詞-サ変接続"); posTags.add("名詞-一般"); posTags.add("名詞-接尾-一般"); Analyzer analyzer = new Analyzer() { @Override protected TokenStreamComponents createComponents(final String fieldName) { final Tokenizer tokenizer = new JapaneseTokenizer(null, false, JapaneseTokenizer.Mode.SEARCH); final PartOfSpeechAttribute posAtt = tokenizer.addAttribute(PartOfSpeechAttribute.class); return new TokenStreamComponents(tokenizer, new PosConcatenationFilter(tokenizer, posTags, new PosConcatenationFilter.PartOfSpeechSupplier() { @Override public String get() { return posAtt.getPartOfSpeech(); } })); } }; assertAnalyzesTo(analyzer, "歯科医院の歯科衛生士", // new String[] { "歯科医院", "の", "歯科衛生士" }, // new int[] { 0, 4, 5 }, // new int[] { 4, 5, 10 }, // new int[] { 1, 1, 1 }); assertAnalyzesTo(analyzer, "明日は詳細設計です。", // new String[] { "明日", "は", "詳細設計", "です", "。" }, // new int[] { 0, 2, 3, 7, 9 }, // new int[] { 2, 3, 7, 9, 10 }, // new int[] { 1, 1, 1, 1, 1 }); }
/**
 * Registers the attributes this class works with on the given token stream.
 *
 * <p>Calls {@code addAttribute} for, in order: {@link OffsetAttribute},
 * {@link ReadingAttribute}, {@link PartOfSpeechAttribute}, {@link InflectionAttribute}
 * and {@link BaseFormAttribute}. The returned attribute instances are discarded.
 * NOTE(review): presumably callers fetch them again later via {@code getAttribute};
 * confirm against the rest of the class.
 *
 * @param tokenStream the stream to register the attributes on
 */
private void addAttributes(TokenStream tokenStream) { tokenStream.addAttribute(OffsetAttribute.class); tokenStream.addAttribute(ReadingAttribute.class); tokenStream.addAttribute(PartOfSpeechAttribute.class); tokenStream.addAttribute(InflectionAttribute.class); tokenStream.addAttribute(BaseFormAttribute.class); }
/**
 * Copies the current part-of-speech tag from the token stream onto the given token.
 *
 * <p>The raw tag is passed through {@code LuceneUtil.translatePartOfSpeech} before it
 * is stored. Nothing is written when the stream has no {@link PartOfSpeechAttribute}
 * or when the attribute reports a {@code null} tag.
 *
 * @param tokenStream the stream whose part-of-speech attribute is read
 * @param token the token that receives the translated part of speech
 */
private void readPartOfSpeech(TokenStream tokenStream, LuceneToken token) {
    final PartOfSpeechAttribute posAttribute = tokenStream.getAttribute(PartOfSpeechAttribute.class);
    if (posAttribute == null) {
        return;
    }

    final String rawTag = posAttribute.getPartOfSpeech();
    if (rawTag == null) {
        return;
    }

    token.setPartOfSpeech(LuceneUtil.translatePartOfSpeech(rawTag));
}
/**
 * Wraps the given token stream in a {@link PosConcatenationFilter}.
 *
 * <p>A {@link PartOfSpeechAttribute} is added to the incoming stream, and the filter is
 * built with this factory's {@code posTags} and a supplier that returns the
 * attribute's current part of speech.
 *
 * @param tokenStream the stream to wrap
 * @return the filter wrapping {@code tokenStream}
 */
@Override
public TokenStream create(final TokenStream tokenStream) {
    final PartOfSpeechAttribute partOfSpeech = tokenStream.addAttribute(PartOfSpeechAttribute.class);
    return new PosConcatenationFilter(tokenStream, posTags, partOfSpeech::getPartOfSpeech);
}