/**
 * Builds an analysis setup whose {@code my_english} filter is a {@code stemmer} with language
 * {@code english}, for a random number of randomly chosen index-created versions, and checks
 * that the filter factory is a {@link StemmerTokenFilterFactory}, that it creates a
 * {@link PorterStemFilter}, and that the {@code my_english} analyzer stems
 * "consolingly" to "consolingli".
 */
public void testEnglishFilterFactory() throws IOException {
    final int rounds = scaledRandomIntBetween(20, 100);
    for (int round = 0; round < rounds; round++) {
        // The version is drawn before the settings (and the temp dir) are created.
        Version createdVersion = VersionUtils.randomVersion(random());
        Settings indexSettings = Settings.builder()
            .put("index.analysis.filter.my_english.type", "stemmer")
            .put("index.analysis.filter.my_english.language", "english")
            .put("index.analysis.analyzer.my_english.tokenizer", "whitespace")
            .put("index.analysis.analyzer.my_english.filter", "my_english")
            .put(SETTING_VERSION_CREATED, createdVersion)
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .build();
        ESTestCase.TestAnalysis testAnalysis = AnalysisTestsHelper.createTestAnalysisFromSettings(indexSettings);

        // The configured filter must resolve to the stemmer factory.
        TokenFilterFactory stemmerFactory = testAnalysis.tokenFilter.get("my_english");
        assertThat(stemmerFactory, instanceOf(StemmerTokenFilterFactory.class));

        // Run the factory over a whitespace tokenizer to see which filter it produces.
        Tokenizer whitespace = new WhitespaceTokenizer();
        whitespace.setReader(new StringReader("foo bar"));
        TokenStream produced = stemmerFactory.create(whitespace);

        NamedAnalyzer englishAnalyzer = testAnalysis.indexAnalyzers.get("my_english");
        assertThat(produced, instanceOf(PorterStemFilter.class));
        assertAnalyzesTo(englishAnalyzer, "consolingly", new String[] {"consolingli"});
    }
}
public void testOverride() throws IOException { // lets make booked stem to books // the override filter will convert "booked" to "books", // but also mark it with KeywordAttribute so Porter will not change it. StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(); builder.add("booked", "books"); Tokenizer tokenizer = new KeywordTokenizer(new StringReader("booked")); TokenStream stream = new PorterStemFilter(new StemmerOverrideFilter( tokenizer, builder.build())); assertTokenStreamContents(stream, new String[] {"books"}); }
public void testIgnoreCase() throws IOException { // lets make booked stem to books // the override filter will convert "booked" to "books", // but also mark it with KeywordAttribute so Porter will not change it. StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(true); builder.add("boOkEd", "books"); Tokenizer tokenizer = new KeywordTokenizer(new StringReader("BooKeD")); TokenStream stream = new PorterStemFilter(new StemmerOverrideFilter( tokenizer, builder.build())); assertTokenStreamContents(stream, new String[] {"books"}); }
/**
 * Builds the override filter from a builder that has no entries added and expects the single
 * token "book" to come out of the override + Porter chain as "book".
 */
public void testNoOverrides() throws IOException {
    StemmerOverrideFilter.Builder emptyOverrides = new StemmerOverrideFilter.Builder(true);

    Tokenizer input = new KeywordTokenizer(new StringReader("book"));
    StemmerOverrideFilter overridden = new StemmerOverrideFilter(input, emptyOverrides.build());
    TokenStream stemmed = new PorterStemFilter(overridden);

    assertTokenStreamContents(stemmed, new String[] {"book"});
}
public void testKeywords() throws IOException { // our stemdict stems dogs to 'cat' Reader reader = new StringReader("testing dogs"); Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); StemmerOverrideFilterFactory factory = new StemmerOverrideFilterFactory(); Map<String,String> args = new HashMap<String,String>(); ResourceLoader loader = new StringMockResourceLoader("dogs\tcat"); args.put("dictionary", "stemdict.txt"); factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); factory.init(args); factory.inform(loader); TokenStream ts = new PorterStemFilter(factory.create(tokenizer)); assertTokenStreamContents(ts, new String[] { "test", "cat" }); }
/**
 * Same as the plain stem-dictionary test, but with {@code ignoreCase=true} and the
 * mixed-case input token "DoGs": the dictionary entry "dogs\tcat" is still expected to apply,
 * giving "test", "cat".
 */
public void testKeywordsCaseInsensitive() throws IOException {
    Reader text = new StringReader("testing DoGs");
    Tokenizer whitespaceTokens = new MockTokenizer(text, MockTokenizer.WHITESPACE, false);

    StemmerOverrideFilterFactory overrideFactory = new StemmerOverrideFilterFactory();
    Map<String,String> factoryArgs = new HashMap<String,String>();
    ResourceLoader dictionaryLoader = new StringMockResourceLoader("dogs\tcat");
    factoryArgs.put("dictionary", "stemdict.txt");
    factoryArgs.put("ignoreCase", "true");

    overrideFactory.setLuceneMatchVersion(TEST_VERSION_CURRENT);
    overrideFactory.init(factoryArgs);
    overrideFactory.inform(dictionaryLoader);

    TokenStream overridden = overrideFactory.create(whitespaceTokens);
    TokenStream stemmed = new PorterStemFilter(overridden);
    assertTokenStreamContents(stemmed, new String[] { "test", "cat" });
}
/**
 * Feeds "dogs cats" through a {@link KeywordMarkerFilterFactory} whose protected-words
 * resource contains "cats", followed by a {@link PorterStemFilter}. The expected output is
 * "dog", "cats": only the unprotected token is stemmed.
 */
public void testKeywords() throws IOException {
    Reader text = new StringReader("dogs cats");
    Tokenizer whitespaceTokens = new MockTokenizer(text, MockTokenizer.WHITESPACE, false);

    KeywordMarkerFilterFactory markerFactory = new KeywordMarkerFilterFactory();
    Map<String,String> factoryArgs = new HashMap<String,String>();
    // The loader serves this content for the protected-words resource requested below.
    ResourceLoader protectedWordsLoader = new StringMockResourceLoader("cats");
    factoryArgs.put("protected", "protwords.txt");

    markerFactory.setLuceneMatchVersion(TEST_VERSION_CURRENT);
    markerFactory.init(factoryArgs);
    markerFactory.inform(protectedWordsLoader);

    TokenStream marked = markerFactory.create(whitespaceTokens);
    TokenStream stemmed = new PorterStemFilter(marked);
    assertTokenStreamContents(stemmed, new String[] { "dog", "cats" });
}
/**
 * Same as the plain protected-words test, but with {@code ignoreCase=true} and an extra
 * capitalised token: for input "dogs cats Cats" with protected word "cats", the expected
 * output is "dog", "cats", "Cats" -- both spellings are left unstemmed.
 */
public void testKeywordsCaseInsensitive() throws IOException {
    Reader text = new StringReader("dogs cats Cats");
    Tokenizer whitespaceTokens = new MockTokenizer(text, MockTokenizer.WHITESPACE, false);

    KeywordMarkerFilterFactory markerFactory = new KeywordMarkerFilterFactory();
    Map<String,String> factoryArgs = new HashMap<String,String>();
    ResourceLoader protectedWordsLoader = new StringMockResourceLoader("cats");
    factoryArgs.put("protected", "protwords.txt");
    factoryArgs.put("ignoreCase", "true");

    markerFactory.setLuceneMatchVersion(TEST_VERSION_CURRENT);
    markerFactory.init(factoryArgs);
    markerFactory.inform(protectedWordsLoader);

    TokenStream marked = markerFactory.create(whitespaceTokens);
    TokenStream stemmed = new PorterStemFilter(marked);
    assertTokenStreamContents(stemmed, new String[] { "dog", "cats", "Cats" });
}
public void testOverride() throws IOException { // lets make booked stem to books // the override filter will convert "booked" to "books", // but also mark it with KeywordAttribute so Porter will not change it. CharArrayMap<String> dictionary = new CharArrayMap<String>(TEST_VERSION_CURRENT, 1, false); dictionary.put("booked", "books"); Tokenizer tokenizer = new KeywordTokenizer(new StringReader("booked")); TokenStream stream = new PorterStemFilter( new StemmerOverrideFilter(tokenizer, dictionary)); assertTokenStreamContents(stream, new String[] { "books" }); }
/**
 * Wraps the given stream in a new {@link PorterStemFilter}.
 *
 * @param tokenStream the stream to be stemmed
 * @return a Porter stem filter reading from {@code tokenStream}
 */
@Override
public TokenStream create(TokenStream tokenStream) {
    final TokenStream stemmed = new PorterStemFilter(tokenStream);
    return stemmed;
}
/**
 * Wraps the given stream in a new {@link PorterStemFilter}.
 *
 * @param input the stream to be stemmed
 * @return a Porter stem filter reading from {@code input}
 */
@Override
public PorterStemFilter create(TokenStream input) {
    final PorterStemFilter stemmed = new PorterStemFilter(input);
    return stemmed;
}
/**
 * Wraps the given stream in a new {@link PorterStemFilter}.
 *
 * @param input the stream to be stemmed
 * @return a Porter stem filter reading from {@code input}
 */
@Override
public TokenFilter wrapper(TokenStream input) {
    final PorterStemFilter stemmed = new PorterStemFilter(input);
    return stemmed;
}
/**
 * Builds the analysis chain: a {@link LowerCaseTokenizer} as the source, with a
 * {@link PorterStemFilter} on top of it as the resulting stream.
 *
 * @param s not used by this implementation
 * @return components pairing the tokenizer with the stemming filter
 */
@Override
protected TokenStreamComponents createComponents(String s) {
    final Tokenizer lowerCased = new LowerCaseTokenizer();
    final PorterStemFilter stemmed = new PorterStemFilter(lowerCased);
    return new TokenStreamComponents(lowerCased, stemmed);
}
/**
 * Builds the analysis chain: a {@link LowerCaseTokenizer} over {@code reader} as the source,
 * with a {@link PorterStemFilter} on top of it as the resulting stream.
 *
 * @param fieldName not used by this implementation
 * @param reader    the text to tokenize
 * @return components pairing the tokenizer with the stemming filter
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    final Tokenizer lowerCased = new LowerCaseTokenizer(LUCENE_VERSION, reader);
    final PorterStemFilter stemmed = new PorterStemFilter(lowerCased);
    return new TokenStreamComponents(lowerCased, stemmed);
}
/**
 * Builds the analysis chain for a field: lower-casing tokenizer, then stop-word removal
 * (using {@code stopWords}) with position increments enabled, then Porter stemming.
 *
 * @param fieldName not used by this implementation
 * @param reader    the text to tokenize
 * @return the stemmed, stop-word-filtered token stream
 */
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
    LowerCaseTokenizer lowerCased = new LowerCaseTokenizer(reader);
    StopFilter withoutStopWords = new StopFilter(true, lowerCased, stopWords);
    // NOTE(review): the constructor is already given `true` as its first argument; this setter
    // call looks redundant if that argument is the same flag -- confirm before removing.
    withoutStopWords.setEnablePositionIncrements(true);
    return new PorterStemFilter(withoutStopWords);
}