/**
 * Verifies that a "stemmer" token filter configured with language "porter2"
 * resolves to a {@link SnowballFilter}-backed {@link StemmerTokenFilterFactory}
 * and stems correctly, across randomly chosen index-created versions.
 */
public void testPorter2FilterFactory() throws IOException {
    int iterations = scaledRandomIntBetween(20, 100);
    for (int iteration = 0; iteration < iterations; iteration++) {
        Version indexVersion = VersionUtils.randomVersion(random());
        Settings settings = Settings.builder()
            .put("index.analysis.filter.my_porter2.type", "stemmer")
            .put("index.analysis.filter.my_porter2.language", "porter2")
            .put("index.analysis.analyzer.my_porter2.tokenizer", "whitespace")
            .put("index.analysis.analyzer.my_porter2.filter", "my_porter2")
            .put(SETTING_VERSION_CREATED, indexVersion)
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .build();
        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);

        // The filter must be the stemmer factory backed by a Snowball filter.
        TokenFilterFactory filterFactory = analysis.tokenFilter.get("my_porter2");
        assertThat(filterFactory, instanceOf(StemmerTokenFilterFactory.class));

        Tokenizer whitespaceTokenizer = new WhitespaceTokenizer();
        whitespaceTokenizer.setReader(new StringReader("foo bar"));
        TokenStream stemmedStream = filterFactory.create(whitespaceTokenizer);
        assertThat(stemmedStream, instanceOf(SnowballFilter.class));

        // The full analyzer applies the porter2 stemming ("possibly" -> "possibl").
        NamedAnalyzer analyzer = analysis.indexAnalyzers.get("my_porter2");
        assertAnalyzesTo(analyzer, "possibly", new String[] { "possibl" });
    }
}
/**
 * Returns the default stop word set for a language.
 *
 * @param language ISO language code (case-insensitive), e.g. "en", "fr", "de"
 * @return the default stop word set for the language; falls back to the
 *         English {@link StandardAnalyzer#STOP_WORDS_SET} when the language
 *         is not recognized
 * @throws RuntimeException if a bundled stop word resource cannot be loaded
 */
public static CharArraySet getDefaultStopSet(String language) {
    try {
        if ("en".equalsIgnoreCase(language)) {
            return StandardAnalyzer.STOP_WORDS_SET;
        } else if ("es".equalsIgnoreCase(language)) {
            return snowballStopSet("spanish_stop.txt");
        } else if ("fr".equalsIgnoreCase(language)) {
            return snowballStopSet("french_stop.txt");
        } else if ("de".equalsIgnoreCase(language)) {
            return snowballStopSet("german_stop.txt");
        } else if ("pl".equalsIgnoreCase(language)) {
            return commentedStopSet(PolishAnalyzer.class, "stopwords.txt");
        } else if ("pt".equalsIgnoreCase(language) || "br".equalsIgnoreCase(language)) {
            return snowballStopSet("portuguese_stop.txt");
        } else if ("it".equalsIgnoreCase(language)) {
            return snowballStopSet("italian_stop.txt");
        } else if ("cz".equalsIgnoreCase(language) || "sk".equalsIgnoreCase(language)) {
            return commentedStopSet(CzechAnalyzer.class, "stopwords.txt");
        } else if ("tr".equalsIgnoreCase(language)) {
            return TurkishAnalyzer.loadStopwordSet(false, TurkishAnalyzer.class, "stopwords.txt", "#");
        } else if ("ru".equalsIgnoreCase(language)) {
            return snowballStopSet("russian_stop.txt");
        } else if ("ro".equalsIgnoreCase(language)) {
            return RomanianAnalyzer.loadStopwordSet(false, RomanianAnalyzer.class, "stopwords.txt", "#");
        } else if ("bg".equalsIgnoreCase(language)) {
            return BulgarianAnalyzer.loadStopwordSet(false, BulgarianAnalyzer.class, "stopwords.txt", "#");
        } else if ("nl".equalsIgnoreCase(language)) {
            return snowballStopSet("dutch_stop.txt");
        }
    } catch (Exception e) {
        // FIX: chain the original exception as the cause instead of discarding it,
        // so the failing resource/IO error is visible in the stack trace.
        throw new RuntimeException("Unable to load default stopword set", e);
    }
    // Unknown language: fall back to the English default.
    return StandardAnalyzer.STOP_WORDS_SET;
}

/** Loads a Snowball-format stop word list bundled alongside {@link SnowballFilter}. */
private static CharArraySet snowballStopSet(String resource) throws IOException {
    return WordlistLoader.getSnowballWordSet(
        IOUtils.getDecodingReader(SnowballFilter.class, resource, StandardCharsets.UTF_8));
}

/** Loads a '#'-commented stop word list from the given analyzer's bundled resources. */
private static CharArraySet commentedStopSet(Class<?> clazz, String resource) throws IOException {
    return WordlistLoader.getWordSet(
        IOUtils.getDecodingReader(clazz, resource, StandardCharsets.UTF_8), "#");
}
/** Wraps the given stream in a Snowball stemmer configured for Russian. */
@Override
public TokenStream create(TokenStream input) {
    final String stemmerName = "Russian";
    return new SnowballFilter(input, stemmerName);
}
/** Wraps the given stream in a Snowball stemmer for the configured language. */
@Override
public TokenStream create(TokenStream input) {
    return new SnowballFilter(input, this.language);
}
/**
 * Checks that {@code RemoveDuplicatesTokenFilter} drops a stemmed token when
 * it is identical to the keyword-repeated original at the same position.
 */
public void testBasic() throws IOException {
    MockTokenizer source =
        new MockTokenizer(new StringReader("the birds are flying"), MockTokenizer.WHITESPACE, false);
    TokenStream stream = new KeywordRepeatFilter(source);
    stream = new SnowballFilter(stream, "English");
    stream = new RemoveDuplicatesTokenFilter(stream);
    // "the" and "are" stem to themselves, so their duplicates are removed;
    // "birds"/"bird" and "flying"/"fli" survive as stacked tokens (increment 0).
    assertTokenStreamContents(stream,
        new String[] { "the", "birds", "bird", "are", "flying", "fli" },
        new int[] { 1, 1, 0, 1, 1, 0 });
}
/**
 * Checks that stacking two {@code KeywordRepeatFilter}s composes: the extra
 * duplicates are still collapsed by {@code RemoveDuplicatesTokenFilter},
 * yielding the same output as a single repeat filter.
 */
public void testComposition() throws IOException {
    MockTokenizer source =
        new MockTokenizer(new StringReader("the birds are flying"), MockTokenizer.WHITESPACE, false);
    TokenStream stream = new KeywordRepeatFilter(new KeywordRepeatFilter(source));
    stream = new SnowballFilter(stream, "English");
    stream = new RemoveDuplicatesTokenFilter(stream);
    assertTokenStreamContents(stream,
        new String[] { "the", "birds", "bird", "are", "flying", "fli" },
        new int[] { 1, 1, 0, 1, 1, 0 });
}
/** Wraps the given stream in a Snowball stemmer configured for French. */
@Override
public TokenStream create(TokenStream input) {
    final String stemmerName = "French";
    return new SnowballFilter(input, stemmerName);
}