@SuppressWarnings("resource") public static void main(String[] args) throws Exception { final Tokenizer tok = new WhitespaceTokenizer(); tok.setReader(new StringReader("dark sea green sea green")); final SynonymMap.Builder builder = new SynonymMap.Builder(true); addSynonym("dark sea green", "color", builder); addSynonym("green", "color", builder); addSynonym("dark sea", "color", builder); addSynonym("sea green", "color", builder); final SynonymMap synMap = builder.build(); final TokenStream ts = new SynonymFilter(tok, synMap, true); final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); final PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class); final PositionLengthAttribute posLengthAtt = ts.addAttribute(PositionLengthAttribute.class); ts.reset(); int pos = -1; while (ts.incrementToken()) { pos += posIncrAtt.getPositionIncrement(); System.out.println("term=" + termAtt + ", pos=" + pos + ", posLen=" + posLengthAtt.getPositionLength()); } ts.end(); ts.close(); }
public void testEnglishFilterFactory() throws IOException {
    int iters = scaledRandomIntBetween(20, 100);
    for (int i = 0; i < iters; i++) {
        Version v = VersionUtils.randomVersion(random());
        Settings settings = Settings.builder()
            .put("index.analysis.filter.my_english.type", "stemmer")
            .put("index.analysis.filter.my_english.language", "english")
            .put("index.analysis.analyzer.my_english.tokenizer", "whitespace")
            .put("index.analysis.analyzer.my_english.filter", "my_english")
            .put(SETTING_VERSION_CREATED, v)
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .build();

        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
        TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_english");
        assertThat(tokenFilter, instanceOf(StemmerTokenFilterFactory.class));

        Tokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("foo bar"));
        TokenStream create = tokenFilter.create(tokenizer);
        IndexAnalyzers indexAnalyzers = analysis.indexAnalyzers;
        NamedAnalyzer analyzer = indexAnalyzers.get("my_english");
        assertThat(create, instanceOf(PorterStemFilter.class));
        assertAnalyzesTo(analyzer, "consolingly", new String[]{"consolingli"});
    }
}
public void testDefault() throws IOException {
    int default_hash_count = 1;
    int default_bucket_size = 512;
    int default_hash_set_size = 1;
    Settings settings = Settings.builder()
        .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
        .build();
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("min_hash");

    String source = "the quick brown fox";
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));

    // with_rotation is true by default, and hash_set_size is 1, so even though the source
    // doesn't have enough tokens to fill all the buckets, we still expect 512 tokens.
    assertStreamHasNumberOfTokens(tokenFilter.create(tokenizer),
        default_hash_count * default_bucket_size * default_hash_set_size);
}
public void testSettings() throws IOException {
    Settings settings = Settings.builder()
        .put("index.analysis.filter.test_min_hash.type", "min_hash")
        .put("index.analysis.filter.test_min_hash.hash_count", "1")
        .put("index.analysis.filter.test_min_hash.bucket_count", "2")
        .put("index.analysis.filter.test_min_hash.hash_set_size", "1")
        .put("index.analysis.filter.test_min_hash.with_rotation", false)
        .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
        .build();
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("test_min_hash");

    String source = "sushi";
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));

    // Despite bucket_count being 2 and hash_set_size being 1, with_rotation is false,
    // so we only expect 1 token here.
    assertStreamHasNumberOfTokens(tokenFilter.create(tokenizer), 1);
}
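Both min_hash tests above rely on an assertStreamHasNumberOfTokens helper that the listing does not show. A plausible minimal version, assuming it just consumes the stream and counts tokens (the helper body is an assumption, not the original test code):

// Hypothetical helper: consume a TokenStream and assert how many tokens it produces.
private static void assertStreamHasNumberOfTokens(TokenStream stream, int expectedCount) throws IOException {
    stream.reset();
    int count = 0;
    while (stream.incrementToken()) {
        count++;
    }
    stream.end();
    stream.close();
    assertEquals("unexpected token count", expectedCount, count);
}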
public void testCorrectPositionIncrementSetting() throws IOException {
    Builder builder = Settings.builder().put("index.analysis.filter.my_stop.type", "stop");
    if (random().nextBoolean()) {
        builder.put("index.analysis.filter.my_stop.version", Version.LATEST);
    } else {
        // don't specify
    }
    builder.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString());
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(builder.build());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_stop");
    assertThat(tokenFilter, instanceOf(StopTokenFilterFactory.class));

    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("foo bar"));
    TokenStream create = tokenFilter.create(tokenizer);
    assertThat(create, instanceOf(StopFilter.class));
}
public void testMultiTerms() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
        .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
        .put("index.analysis.filter.my_word_delimiter.type", type)
        .put("index.analysis.filter.my_word_delimiter.catenate_all", "true")
        .put("index.analysis.filter.my_word_delimiter.preserve_original", "true")
        .build());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");

    String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
    String[] expected = new String[]{
        "PowerShot", "PowerShot", "Power", "Shot",
        "50042", "500-42", "500", "42",
        "wifi", "wi-fi", "wi", "fi",
        "wifi4000", "wi-fi-4000", "wi", "fi", "4000",
        "j2se", "j2se", "j", "2", "se",
        "ONeil", "O'Neil's", "O", "Neil"};
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    int[] expectedIncr = new int[]{1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1};
    int[] expectedPosLen = new int[]{2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 3, 3, 1, 1, 1, 3, 3, 1, 1, 1, 2, 2, 1, 1};
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected, null, null, null,
        expectedIncr, expectedPosLen, null);
}
/** Correct offset order when doing both parts and concatenation: PowerShot is a synonym of Power */
public void testPartsAndCatenate() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
        .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
        .put("index.analysis.filter.my_word_delimiter.type", type)
        .put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
        .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true")
        .build());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");

    String source = "PowerShot";
    int[] expectedIncr = new int[]{1, 0, 1};
    int[] expectedPosLen = new int[]{2, 1, 1};
    String[] expected = new String[]{"PowerShot", "Power", "Shot"};
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected, null, null, null,
        expectedIncr, expectedPosLen, null);
}
@Override
public void inform(ResourceLoader loader) throws IOException {
    final TokenizerFactory factory = tokenizerFactory == null ? null : loadTokenizerFactory(loader, tokenizerFactory);

    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer tokenizer = factory == null
                ? new WhitespaceTokenizer(Version.LUCENE_CURRENT, reader)
                : factory.create(reader);
            TokenStream stream = ignoreCase ? new LowerCaseFilter(Version.LUCENE_CURRENT, tokenizer) : tokenizer;
            return new TokenStreamComponents(tokenizer, stream);
        }
    };

    try {
        String formatClass = format;
        if (format == null || format.equals("solr")) {
            formatClass = SolrSynonymParser.class.getName();
        } else if (format.equals("wordnet")) {
            formatClass = WordnetSynonymParser.class.getName();
        }
        // TODO: expose dedup as a parameter?
        map = loadSynonyms(loader, formatClass, true, analyzer);
    } catch (ParseException e) {
        throw new IOException("Error parsing synonyms file:", e);
    }
}
private int countTokensInText(String text) {
    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(new StringReader(text));
    // tokenizer.setReader(new StringReader(text));
    int tokens = 0;
    try {
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            ++tokens;
        }
    } catch (Exception e) {
        LOGGER.error("Error while tokenizing text. Returning.", e);
    } finally {
        IOUtils.closeQuietly(tokenizer);
    }
    return tokens;
}
@Test public void testBasic() throws Exception { final Path dictPath = Files.createTempFile("rsf_", ".txt"); final long reloadInterval = 500; writeFile(dictPath, "aaa"); Analyzer analyzer = new Analyzer() { @Override protected TokenStreamComponents createComponents(final String fieldName) { final Tokenizer tokenizer = new WhitespaceTokenizer(); return new TokenStreamComponents(tokenizer, new ReloadableStopFilter(tokenizer, dictPath, true, reloadInterval)); } }; String input = "aaa bbb"; assertAnalyzesTo(analyzer, input, new String[] { "bbb" }); Thread.sleep(1000L); writeFile(dictPath, "bbb"); Thread.sleep(1000L); assertAnalyzesTo(analyzer, input, new String[] { "aaa" }); }
@Test public void testBasic() throws Exception { final Path dictPath = Files.createTempFile("rkmf_", ".txt"); final long reloadInterval = 500; writeFile(dictPath, "aaa"); Analyzer analyzer = new Analyzer() { @Override protected TokenStreamComponents createComponents(final String fieldName) { final Tokenizer tokenizer = new WhitespaceTokenizer(); return new TokenStreamComponents(tokenizer, new ReloadableKeywordMarkerFilter(tokenizer, dictPath, reloadInterval)); } }; String input = "aaa bbb"; assertTokenStreamContents(analyzer.tokenStream("dummy", input), new String[] { "aaa", "bbb" }, new int[] { 0, 4 }, new int[] { 3, 7 }, null, null, null, input.length(), new boolean[] { true, false }, true); Thread.sleep(1000L); writeFile(dictPath, "bbb"); Thread.sleep(1000L); assertTokenStreamContents(analyzer.tokenStream("dummy", input), new String[] { "aaa", "bbb" }, new int[] { 0, 4 }, new int[] { 3, 7 }, null, null, null, input.length(), new boolean[] { false, true }, true); }
@Test
public void testBasic() throws IOException {
    final Pattern pattern1 = Pattern.compile("平成|昭和");
    final Pattern pattern2 = Pattern.compile("[0-9]+年");

    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(final String fieldName) {
            final Tokenizer tokenizer = new WhitespaceTokenizer();
            return new TokenStreamComponents(tokenizer,
                new PatternConcatenationFilter(tokenizer, pattern1, pattern2));
        }
    };

    assertAnalyzesTo(analyzer, "平成 10年",
        new String[] { "平成10年" },
        new int[] { 0 },
        new int[] { 6 },
        new int[] { 1 });
    assertAnalyzesTo(analyzer, "aaa 昭和 56年 bbb",
        new String[] { "aaa", "昭和56年", "bbb" },
        new int[] { 1, 1, 1 });
    assertAnalyzesTo(analyzer, "大正 5年",
        new String[] { "大正", "5年" },
        new int[] { 1, 1 });
}
@Test
public void testBasic() throws IOException {
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(final String fieldName) {
            final Tokenizer tokenizer = new WhitespaceTokenizer();
            return new TokenStreamComponents(tokenizer,
                new StopTokenPrefixFilter(tokenizer, new String[] { "b", "dd" }, false));
        }
    };

    assertAnalyzesTo(analyzer, "aaa bbb ccc ddd eee",
        new String[] { "aaa", "ccc", "eee" },
        new int[] { 0, 8, 16 },
        new int[] { 3, 11, 19 },
        new int[] { 1, 2, 2 });
    assertAnalyzesTo(analyzer, "aaa", new String[] { "aaa" });
    assertAnalyzesTo(analyzer, "ddd", new String[0]);
    assertAnalyzesTo(analyzer, "add", new String[] { "add" });
    assertAnalyzesTo(analyzer, "aad", new String[] { "aad" });
    assertAnalyzesTo(analyzer, "dda", new String[0]);
    assertAnalyzesTo(analyzer, "daa", new String[] { "daa" });
}
@Test
public void testBasic() throws IOException {
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(final String fieldName) {
            final Tokenizer tokenizer = new WhitespaceTokenizer();
            return new TokenStreamComponents(tokenizer,
                new StopTokenSuffixFilter(tokenizer, new String[] { "b", "dd" }, false));
        }
    };

    assertAnalyzesTo(analyzer, "aaa bbb ccc ddd eee",
        new String[] { "aaa", "ccc", "eee" },
        new int[] { 0, 8, 16 },
        new int[] { 3, 11, 19 },
        new int[] { 1, 2, 2 });
    assertAnalyzesTo(analyzer, "aaa", new String[] { "aaa" });
    assertAnalyzesTo(analyzer, "ddd", new String[0]);
    assertAnalyzesTo(analyzer, "add", new String[0]);
    assertAnalyzesTo(analyzer, "aad", new String[] { "aad" });
    assertAnalyzesTo(analyzer, "dda", new String[] { "dda" });
    assertAnalyzesTo(analyzer, "daa", new String[] { "daa" });
}
public void testTokenEndingWithWordComponentOfMinimumLength() throws Exception {
    CharArraySet dict = makeDictionary("ab", "cd", "ef");

    Tokenizer tokenizer = new MockTokenizer(new StringReader("abcdef"), MockTokenizer.WHITESPACE, false);
    DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(
        new WhitespaceTokenizer(new StringReader("abcdef")),
        dict,
        CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
        CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
        CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
        false);

    assertTokenStreamContents(tf,
        new String[] { "abcdef", "ab", "cd", "ef" },
        new int[] { 0, 0, 0, 0 },
        new int[] { 6, 6, 6, 6 },
        new int[] { 1, 0, 0, 0 });
}
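The compound-word tests in this listing call a makeDictionary helper that is not shown. A minimal sketch, assuming it just wraps the given words in a case-sensitive CharArraySet (the older, Version-taking tests further down would pass TEST_VERSION_CURRENT as a first constructor argument):

// Hypothetical helper: build a CharArraySet dictionary from the given words.
private static CharArraySet makeDictionary(String... dictionary) {
    return new CharArraySet(Arrays.asList(dictionary), true);
}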
public void testWordComponentWithLessThanMinimumLength() throws Exception {
    CharArraySet dict = makeDictionary("abc", "d", "efg");

    Tokenizer tokenizer = new MockTokenizer(new StringReader("abcdefg"), MockTokenizer.WHITESPACE, false);
    DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(
        new WhitespaceTokenizer(new StringReader("abcdefg")),
        dict,
        CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
        CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
        CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
        false);

    // since "d" is shorter than the minimum subword size, it should not be added to the token stream
    assertTokenStreamContents(tf,
        new String[] { "abcdefg", "abc", "efg" },
        new int[] { 0, 0, 0 },
        new int[] { 7, 7, 7 },
        new int[] { 1, 0, 0 });
}
public void testReset() throws Exception {
    final String input = "How the s a brown s cow d like A B thing?";
    WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
    CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);

    CharTermAttribute term = cgf.addAttribute(CharTermAttribute.class);
    cgf.reset();
    assertTrue(cgf.incrementToken());
    assertEquals("How", term.toString());
    assertTrue(cgf.incrementToken());
    assertEquals("How_the", term.toString());
    assertTrue(cgf.incrementToken());
    assertEquals("the", term.toString());
    assertTrue(cgf.incrementToken());
    assertEquals("the_s", term.toString());
    cgf.close();

    wt.setReader(new StringReader(input));
    cgf.reset();
    assertTrue(cgf.incrementToken());
    assertEquals("How", term.toString());
}
public void testQueryReset() throws Exception {
    final String input = "How the s a brown s cow d like A B thing?";
    WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
    CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
    CommonGramsQueryFilter nsf = new CommonGramsQueryFilter(cgf);

    CharTermAttribute term = wt.addAttribute(CharTermAttribute.class);
    nsf.reset();
    assertTrue(nsf.incrementToken());
    assertEquals("How_the", term.toString());
    assertTrue(nsf.incrementToken());
    assertEquals("the_s", term.toString());
    nsf.close();

    wt.setReader(new StringReader(input));
    nsf.reset();
    assertTrue(nsf.incrementToken());
    assertEquals("How_the", term.toString());
}
public void testReset() throws Exception {
    Tokenizer wsTokenizer = new WhitespaceTokenizer(new StringReader("please divide this sentence"));
    TokenStream filter = new ShingleFilter(wsTokenizer, 2);
    assertTokenStreamContents(filter,
        new String[] { "please", "please divide", "divide", "divide this", "this", "this sentence", "sentence" },
        new int[] { 0, 0, 7, 7, 14, 14, 19 },
        new int[] { 6, 13, 13, 18, 18, 27, 27 },
        new String[] { TypeAttribute.DEFAULT_TYPE, "shingle", TypeAttribute.DEFAULT_TYPE, "shingle",
            TypeAttribute.DEFAULT_TYPE, "shingle", TypeAttribute.DEFAULT_TYPE },
        new int[] { 1, 0, 1, 0, 1, 0, 1 });

    wsTokenizer.setReader(new StringReader("please divide this sentence"));
    assertTokenStreamContents(filter,
        new String[] { "please", "please divide", "divide", "divide this", "this", "this sentence", "sentence" },
        new int[] { 0, 0, 7, 7, 14, 14, 19 },
        new int[] { 6, 13, 13, 18, 18, 27, 27 },
        new String[] { TypeAttribute.DEFAULT_TYPE, "shingle", TypeAttribute.DEFAULT_TYPE, "shingle",
            TypeAttribute.DEFAULT_TYPE, "shingle", TypeAttribute.DEFAULT_TYPE },
        new int[] { 1, 0, 1, 0, 1, 0, 1 });
}
private int countTokensInText(String text) {
    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(text));
    int tokens = 0;
    try {
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            ++tokens;
        }
    } catch (Exception e) {
        LOGGER.error("Error while tokenizing text. Returning.", e);
    } finally {
        IOUtils.closeQuietly(tokenizer);
    }
    return tokens;
}
@SuppressWarnings("resource") private static void addDocument(IndexWriter writer, String text) throws IOException { final Tokenizer tokenizer = new WhitespaceTokenizer(); tokenizer.setReader(new StringReader(text)); final TeeSinkTokenFilter textStream = new TeeSinkTokenFilter(tokenizer); final TokenStream colorsStream = new AnnotatorTokenFilter( textStream.newSinkTokenStream(), ColorAnnotator.withDefaultColors()); final TokenStream animalsStream = new AnnotatorTokenFilter( textStream.newSinkTokenStream(), AnimalAnnotator.withDefaultAnimals()); final Document doc = new Document(); doc.add(new StoredField(TEXT_FIELD, text)); doc.add(new TextField(TEXT_FIELD, textStream)); doc.add(new TextField(COLOR_FIELD, colorsStream)); doc.add(new TextField(ANIMAL_FIELD, animalsStream)); writer.addDocument(doc); }
@SuppressWarnings("resource") private static void addDocument(IndexWriter writer, String text) throws IOException { final Tokenizer tokenizer = new WhitespaceTokenizer(); tokenizer.setReader(new StringReader(text)); final TeeSinkTokenFilter textStream = new TeeSinkTokenFilter(tokenizer); final TokenStream colorsStream = new AnyAnnotationTokenFilter(new AnnotatorTokenFilter( textStream.newSinkTokenStream(), ColorAnnotator.withDefaultColors())); final TokenStream animalsStream = new AnyAnnotationTokenFilter(new AnnotatorTokenFilter( textStream.newSinkTokenStream(), AnimalAnnotator.withDefaultAnimals())); final Document doc = new Document(); doc.add(new StoredField(TEXT_FIELD, text)); doc.add(new TextField(TEXT_FIELD, textStream)); doc.add(new TextField(COLOR_FIELD, colorsStream)); doc.add(new TextField(ANIMAL_FIELD, animalsStream)); writer.addDocument(doc); }
public void testOverlappingAtBeginning() throws Exception {
    final CharArraySet phraseSets = new CharArraySet(Arrays.asList(
        "new york", "new york city", "city of new york"), false);
    final String input = "new york city is great";

    StringReader reader = new StringReader(input);
    final WhitespaceTokenizer in = new WhitespaceTokenizer(reader);

    AutoPhrasingTokenFilter aptf = new AutoPhrasingTokenFilter(in, phraseSets, false);
    aptf.setReplaceWhitespaceWith(new Character('_'));
    CharTermAttribute term = aptf.addAttribute(CharTermAttribute.class);
    aptf.reset();

    assertTrue(aptf.incrementToken());
    assertEquals("new_york_city", term.toString());
    assertTrue(aptf.incrementToken());
    assertEquals("is", term.toString());
    assertTrue(aptf.incrementToken());
    assertEquals("great", term.toString());
}
public void testOverlappingAtEnd() throws Exception {
    final CharArraySet phraseSets = new CharArraySet(Arrays.asList(
        "new york", "new york city", "city of new york"), false);
    final String input = "the great city of new york";

    StringReader reader = new StringReader(input);
    final WhitespaceTokenizer in = new WhitespaceTokenizer(reader);

    AutoPhrasingTokenFilter aptf = new AutoPhrasingTokenFilter(in, phraseSets, false);
    aptf.setReplaceWhitespaceWith(new Character('_'));
    CharTermAttribute term = aptf.addAttribute(CharTermAttribute.class);
    aptf.reset();

    assertTrue(aptf.incrementToken());
    assertEquals("the", term.toString());
    assertTrue(aptf.incrementToken());
    assertEquals("great", term.toString());
    assertTrue(aptf.incrementToken());
    assertEquals("city_of_new_york", term.toString());
}
public void testIncompletePhrase() throws Exception {
    final CharArraySet phraseSets = new CharArraySet(Arrays.asList(
        "big apple", "new york city", "property tax", "three word phrase"), false);
    final String input = "some new york";

    StringReader reader = new StringReader(input);
    final WhitespaceTokenizer in = new WhitespaceTokenizer(reader);

    AutoPhrasingTokenFilter aptf = new AutoPhrasingTokenFilter(in, phraseSets, false);
    aptf.setReplaceWhitespaceWith(new Character('_'));
    CharTermAttribute term = aptf.addAttribute(CharTermAttribute.class);
    aptf.reset();

    assertTrue(aptf.incrementToken());
    assertEquals("some", term.toString());
    assertTrue(aptf.incrementToken());
    assertEquals("new", term.toString());
    assertTrue(aptf.incrementToken());
    assertEquals("york", term.toString());
}
public void testTokenEndingWithWordComponentOfMinimumLength() throws Exception {
    CharArraySet dict = makeDictionary("ab", "cd", "ef");

    DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
        new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("abcdef")),
        dict,
        CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
        CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
        CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
        false);

    assertTokenStreamContents(tf,
        new String[] { "abcdef", "ab", "cd", "ef" },
        new int[] { 0, 0, 2, 4 },
        new int[] { 6, 2, 4, 6 },
        new int[] { 1, 0, 0, 0 });
}
public void testWordComponentWithLessThanMinimumLength() throws Exception {
    CharArraySet dict = makeDictionary("abc", "d", "efg");

    DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
        new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("abcdefg")),
        dict,
        CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
        CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
        CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
        false);

    // since "d" is shorter than the minimum subword size, it should not be added to the token stream
    assertTokenStreamContents(tf,
        new String[] { "abcdefg", "abc", "efg" },
        new int[] { 0, 0, 4 },
        new int[] { 7, 3, 7 },
        new int[] { 1, 0, 0 });
}
public void testReset() throws Exception {
    CharArraySet dict = makeDictionary("Rind", "Fleisch", "Draht", "Schere", "Gesetz", "Aufgabe", "Überwachung");

    Tokenizer wsTokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
        new StringReader("Rindfleischüberwachungsgesetz"));
    DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
        wsTokenizer, dict,
        CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
        CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
        CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
        false);

    CharTermAttribute termAtt = tf.getAttribute(CharTermAttribute.class);
    tf.reset();
    assertTrue(tf.incrementToken());
    assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
    assertTrue(tf.incrementToken());
    assertEquals("Rind", termAtt.toString());

    wsTokenizer.setReader(new StringReader("Rindfleischüberwachungsgesetz"));
    tf.reset();
    assertTrue(tf.incrementToken());
    assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
}
public void testRetainMockAttribute() throws Exception {
    CharArraySet dict = makeDictionary("abc", "d", "efg");
    Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("abcdefg"));
    TokenStream stream = new MockRetainAttributeFilter(tokenizer);
    stream = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT, stream, dict,
        CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
        CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
        CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
        false);

    MockRetainAttribute retAtt = stream.addAttribute(MockRetainAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        assertTrue("Custom attribute value was lost", retAtt.getRetain());
    }
}
public void testReset() throws Exception {
    final String input = "How the s a brown s cow d like A B thing?";
    WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
    CommonGramsFilter cgf = new CommonGramsFilter(TEST_VERSION_CURRENT, wt, commonWords);

    CharTermAttribute term = cgf.addAttribute(CharTermAttribute.class);
    cgf.reset();
    assertTrue(cgf.incrementToken());
    assertEquals("How", term.toString());
    assertTrue(cgf.incrementToken());
    assertEquals("How_the", term.toString());
    assertTrue(cgf.incrementToken());
    assertEquals("the", term.toString());
    assertTrue(cgf.incrementToken());
    assertEquals("the_s", term.toString());

    wt.setReader(new StringReader(input));
    cgf.reset();
    assertTrue(cgf.incrementToken());
    assertEquals("How", term.toString());
}
public void testQueryReset() throws Exception {
    final String input = "How the s a brown s cow d like A B thing?";
    WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
    CommonGramsFilter cgf = new CommonGramsFilter(TEST_VERSION_CURRENT, wt, commonWords);
    CommonGramsQueryFilter nsf = new CommonGramsQueryFilter(cgf);

    CharTermAttribute term = wt.addAttribute(CharTermAttribute.class);
    nsf.reset();
    assertTrue(nsf.incrementToken());
    assertEquals("How_the", term.toString());
    assertTrue(nsf.incrementToken());
    assertEquals("the_s", term.toString());

    wt.setReader(new StringReader(input));
    nsf.reset();
    assertTrue(nsf.incrementToken());
    assertEquals("How_the", term.toString());
}
public void testReset() throws Exception {
    Tokenizer wsTokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
        new StringReader("please divide this sentence"));
    TokenStream filter = new ShingleFilter(wsTokenizer, 2);
    assertTokenStreamContents(filter,
        new String[] { "please", "please divide", "divide", "divide this", "this", "this sentence", "sentence" },
        new int[] { 0, 0, 7, 7, 14, 14, 19 },
        new int[] { 6, 13, 13, 18, 18, 27, 27 },
        new String[] { TypeAttribute.DEFAULT_TYPE, "shingle", TypeAttribute.DEFAULT_TYPE, "shingle",
            TypeAttribute.DEFAULT_TYPE, "shingle", TypeAttribute.DEFAULT_TYPE },
        new int[] { 1, 0, 1, 0, 1, 0, 1 });

    wsTokenizer.setReader(new StringReader("please divide this sentence"));
    assertTokenStreamContents(filter,
        new String[] { "please", "please divide", "divide", "divide this", "this", "this sentence", "sentence" },
        new int[] { 0, 0, 7, 7, 14, 14, 19 },
        new int[] { 6, 13, 13, 18, 18, 27, 27 },
        new String[] { TypeAttribute.DEFAULT_TYPE, "shingle", TypeAttribute.DEFAULT_TYPE, "shingle",
            TypeAttribute.DEFAULT_TYPE, "shingle", TypeAttribute.DEFAULT_TYPE },
        new int[] { 1, 0, 1, 0, 1, 0, 1 });
}
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    Tokenizer source = new WhitespaceTokenizer();
    TokenStream result = null;

    if (stopFilterFactory != null) {
        result = this.stopFilterFactory.create(source);
    } else {
        logger.warn(".createComponents: stopFilter disabled");
    }

    if (AnalyzerType.QUERY.equals(analyzerType)) {
        if (synonymFilterFactory != null) {
            result = this.synonymFilterFactory.create(result == null ? source : result);
        } else {
            logger.warn(".createComponents: synonymFilter disabled");
        }
    }

    return new TokenStreamComponents(source, result == null ? source : result);
}
@Test
public void testOverlappingAtBeginning() throws Exception {
    final CharArraySet phraseSets = new CharArraySet(Arrays.asList(
        "new york", "new york city", "city of new york"), false);
    final String input = "new york city is great";

    StringReader reader = new StringReader(input);
    final WhitespaceTokenizer in = new WhitespaceTokenizer();
    in.setReader(reader);

    AutoPhrasingTokenFilter aptf = new AutoPhrasingTokenFilter(in, phraseSets, false);
    aptf.setReplaceWhitespaceWith('_');
    CharTermAttribute term = aptf.addAttribute(CharTermAttribute.class);
    aptf.reset();

    assertTrue(aptf.incrementToken());
    assertEquals("new_york_city", term.toString());
    assertTrue(aptf.incrementToken());
    assertEquals("is", term.toString());
    assertTrue(aptf.incrementToken());
    assertEquals("great", term.toString());
}
@Test
public void testOverlappingAtEnd() throws Exception {
    final CharArraySet phraseSets = new CharArraySet(Arrays.asList(
        "new york", "new york city", "city of new york"), false);
    final String input = "the great city of new york";

    StringReader reader = new StringReader(input);
    final WhitespaceTokenizer in = new WhitespaceTokenizer();
    in.setReader(reader);

    AutoPhrasingTokenFilter aptf = new AutoPhrasingTokenFilter(in, phraseSets, false);
    aptf.setReplaceWhitespaceWith('_');
    CharTermAttribute term = aptf.addAttribute(CharTermAttribute.class);
    aptf.reset();

    assertTrue(aptf.incrementToken());
    assertEquals("the", term.toString());
    assertTrue(aptf.incrementToken());
    assertEquals("great", term.toString());
    assertTrue(aptf.incrementToken());
    assertEquals("city_of_new_york", term.toString());
}
@Test
public void testIncompletePhrase() throws Exception {
    final CharArraySet phraseSets = new CharArraySet(Arrays.asList(
        "big apple", "new york city", "property tax", "three word phrase"), false);
    final String input = "some new york";

    StringReader reader = new StringReader(input);
    final WhitespaceTokenizer in = new WhitespaceTokenizer();
    in.setReader(reader);

    AutoPhrasingTokenFilter aptf = new AutoPhrasingTokenFilter(in, phraseSets, false);
    aptf.setReplaceWhitespaceWith('_');
    CharTermAttribute term = aptf.addAttribute(CharTermAttribute.class);
    aptf.reset();

    assertTrue(aptf.incrementToken());
    assertEquals("some", term.toString());
    assertTrue(aptf.incrementToken());
    assertEquals("new", term.toString());
    assertTrue(aptf.incrementToken());
    assertEquals("york", term.toString());
}