public static void main(String[] args) throws IOException {
    // Word-level segmentation of a Chinese sentence.
    List<Term> parse = ToAnalysis.parse("中华人民 共和国 成立了 ");
    System.out.println(parse);

    // Index-oriented segmentation keeps overlapping terms.
    List<Term> parse1 = IndexAnalysis.parse("你吃过饭了没有!!!!!吃过无妨论文");
    //System.out.println(parse1);

    String text11 = "ZW321282050000000325";
    Tokenizer tokenizer = new AnsjTokenizer(new StringReader(text11), 0, true);
    CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = tokenizer.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute positionIncrementAtt = tokenizer.addAttribute(PositionIncrementAttribute.class);

    tokenizer.reset();
    while (tokenizer.incrementToken()) {
        System.out.print(termAtt.toString() + " ");
        //System.out.print(offsetAtt.startOffset() + "-" + offsetAtt.endOffset() + "-");
        //System.out.print(positionIncrementAtt.getPositionIncrement() + "/");
    }
    tokenizer.close();
}
public void testSimple() throws IOException {
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            return new TokenStreamComponents(t, new UniqueTokenFilter(t));
        }
    };

    TokenStream test = analyzer.tokenStream("test", "this test with test");
    test.reset();
    CharTermAttribute termAttribute = test.addAttribute(CharTermAttribute.class);

    // The duplicate second "test" token is removed by UniqueTokenFilter.
    assertThat(test.incrementToken(), equalTo(true));
    assertThat(termAttribute.toString(), equalTo("this"));

    assertThat(test.incrementToken(), equalTo(true));
    assertThat(termAttribute.toString(), equalTo("test"));

    assertThat(test.incrementToken(), equalTo(true));
    assertThat(termAttribute.toString(), equalTo("with"));

    assertThat(test.incrementToken(), equalTo(false));
}
public void testEnglishFilterFactory() throws IOException {
    int iters = scaledRandomIntBetween(20, 100);
    for (int i = 0; i < iters; i++) {
        Version v = VersionUtils.randomVersion(random());
        Settings settings = Settings.builder()
                .put("index.analysis.filter.my_english.type", "stemmer")
                .put("index.analysis.filter.my_english.language", "english")
                .put("index.analysis.analyzer.my_english.tokenizer", "whitespace")
                .put("index.analysis.analyzer.my_english.filter", "my_english")
                .put(SETTING_VERSION_CREATED, v)
                .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
                .build();

        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
        TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_english");
        assertThat(tokenFilter, instanceOf(StemmerTokenFilterFactory.class));

        Tokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("foo bar"));
        TokenStream create = tokenFilter.create(tokenizer);
        IndexAnalyzers indexAnalyzers = analysis.indexAnalyzers;
        NamedAnalyzer analyzer = indexAnalyzers.get("my_english");
        assertThat(create, instanceOf(PorterStemFilter.class));
        assertAnalyzesTo(analyzer, "consolingly", new String[]{"consolingli"});
    }
}
public void testPreTokenization() throws IOException {
    // Make sure that pretokenization works well and that it can be used
    // even with token chars which are supplementary characters
    final Index index = new Index("test", "_na_");
    final String name = "ngr";
    final Settings indexSettings = newAnalysisSettingsBuilder().build();
    Settings settings = newAnalysisSettingsBuilder()
            .put("min_gram", 2).put("max_gram", 3)
            .put("token_chars", "letter,digit").build();
    Tokenizer tokenizer = new NGramTokenizerFactory(
            IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings).create();
    tokenizer.setReader(new StringReader("Åbc déf g\uD801\uDC00f "));
    assertTokenStreamContents(tokenizer,
            new String[] {"Åb", "Åbc", "bc", "dé", "déf", "éf", "g\uD801\uDC00", "g\uD801\uDC00f", "\uD801\uDC00f"});

    settings = newAnalysisSettingsBuilder()
            .put("min_gram", 2).put("max_gram", 3)
            .put("token_chars", "letter,digit,punctuation,whitespace,symbol").build();
    tokenizer = new NGramTokenizerFactory(
            IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings).create();
    tokenizer.setReader(new StringReader(" a!$ 9"));
    assertTokenStreamContents(tokenizer,
            new String[] {" a", " a!", "a!", "a!$", "!$", "!$ ", "$ ", "$ 9", " 9"});
}
public void testPreTokenizationEdge() throws IOException {
    // Make sure that pretokenization works well and that it can be used
    // even with token chars which are supplementary characters
    final Index index = new Index("test", "_na_");
    final String name = "ngr";
    final Settings indexSettings = newAnalysisSettingsBuilder().build();
    Settings settings = newAnalysisSettingsBuilder()
            .put("min_gram", 2).put("max_gram", 3)
            .put("token_chars", "letter,digit").build();
    Tokenizer tokenizer = new EdgeNGramTokenizerFactory(
            IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings).create();
    tokenizer.setReader(new StringReader("Åbc déf g\uD801\uDC00f "));
    assertTokenStreamContents(tokenizer,
            new String[] {"Åb", "Åbc", "dé", "déf", "g\uD801\uDC00", "g\uD801\uDC00f"});

    settings = newAnalysisSettingsBuilder()
            .put("min_gram", 2).put("max_gram", 3)
            .put("token_chars", "letter,digit,punctuation,whitespace,symbol").build();
    tokenizer = new EdgeNGramTokenizerFactory(
            IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings).create();
    tokenizer.setReader(new StringReader(" a!$ 9"));
    assertTokenStreamContents(tokenizer, new String[] {" a", " a!"});
}
public void testBackwardsCompatibilityEdgeNgramTokenFilter() throws Exception {
    int iters = scaledRandomIntBetween(20, 100);
    for (int i = 0; i < iters; i++) {
        final Index index = new Index("test", "_na_");
        final String name = "ngr";
        Version v = randomVersion(random());
        Builder builder = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3);
        boolean reverse = random().nextBoolean();
        if (reverse) {
            builder.put("side", "back");
        }
        Settings settings = builder.build();
        Settings indexSettings = newAnalysisSettingsBuilder()
                .put(IndexMetaData.SETTING_VERSION_CREATED, v.id).build();
        Tokenizer tokenizer = new MockTokenizer();
        tokenizer.setReader(new StringReader("foo bar"));
        TokenStream edgeNGramTokenFilter = new EdgeNGramTokenFilterFactory(
                IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings).create(tokenizer);
        if (reverse) {
            assertThat(edgeNGramTokenFilter, instanceOf(ReverseStringFilter.class));
        } else {
            assertThat(edgeNGramTokenFilter, instanceOf(EdgeNGramTokenFilter.class));
        }
    }
}
public void testDefault() throws IOException {
    int default_hash_count = 1;
    int default_bucket_size = 512;
    int default_hash_set_size = 1;
    Settings settings = Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .build();
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("min_hash");
    String source = "the quick brown fox";
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));

    // with_rotation is true by default, and hash_set_size is 1, so even though the source doesn't
    // have enough tokens to fill all the buckets, we still expect 512 tokens.
    assertStreamHasNumberOfTokens(tokenFilter.create(tokenizer),
            default_hash_count * default_bucket_size * default_hash_set_size);
}
public void testSettings() throws IOException {
    Settings settings = Settings.builder()
            .put("index.analysis.filter.test_min_hash.type", "min_hash")
            .put("index.analysis.filter.test_min_hash.hash_count", "1")
            .put("index.analysis.filter.test_min_hash.bucket_count", "2")
            .put("index.analysis.filter.test_min_hash.hash_set_size", "1")
            .put("index.analysis.filter.test_min_hash.with_rotation", false)
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .build();
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("test_min_hash");
    String source = "sushi";
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));

    // despite the fact that bucket_count is 2 and hash_set_size is 1,
    // because with_rotation is false, we only expect 1 token here.
    assertStreamHasNumberOfTokens(tokenFilter.create(tokenizer), 1);
}
public void testCorrectPositionIncrementSetting() throws IOException {
    Builder builder = Settings.builder().put("index.analysis.filter.my_stop.type", "stop");
    if (random().nextBoolean()) {
        builder.put("index.analysis.filter.my_stop.version", Version.LATEST);
    } else {
        // don't specify a version; rely on the default
    }
    builder.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString());
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(builder.build());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_stop");
    assertThat(tokenFilter, instanceOf(StopTokenFilterFactory.class));
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("foo bar"));
    TokenStream create = tokenFilter.create(tokenizer);
    assertThat(create, instanceOf(StopFilter.class));
}
public void testMultiTerms() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .put("index.analysis.filter.my_word_delimiter.type", type)
            .put("index.analysis.filter.my_word_delimiter.catenate_all", "true")
            .put("index.analysis.filter.my_word_delimiter.preserve_original", "true")
            .build());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
    String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
    String[] expected = new String[]{"PowerShot", "PowerShot", "Power", "Shot", "50042",
            "500-42", "500", "42", "wifi", "wi-fi", "wi", "fi", "wifi4000", "wi-fi-4000",
            "wi", "fi", "4000", "j2se", "j2se", "j", "2", "se", "ONeil", "O'Neil's", "O", "Neil"};
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    int[] expectedIncr = new int[]{1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1};
    int[] expectedPosLen = new int[]{2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 3, 3, 1, 1, 1, 3, 3, 1, 1, 1, 2, 2, 1, 1};
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected, null, null, null,
            expectedIncr, expectedPosLen, null);
}
/** Correct offset order when doing both parts and concatenation: PowerShot is a synonym of Power */
public void testPartsAndCatenate() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .put("index.analysis.filter.my_word_delimiter.type", type)
            .put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
            .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true")
            .build());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
    String source = "PowerShot";
    int[] expectedIncr = new int[]{1, 0, 1};
    int[] expectedPosLen = new int[]{2, 1, 1};
    String[] expected = new String[]{"PowerShot", "Power", "Shot"};
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected, null, null, null,
            expectedIncr, expectedPosLen, null);
}
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    // A one-shot tokenizer: it ignores its reader and emits the captured
    // 'output' string (from the enclosing scope) as a single token.
    Tokenizer tokenizer = new Tokenizer() {
        boolean incremented = false;
        CharTermAttribute term = addAttribute(CharTermAttribute.class);

        @Override
        public boolean incrementToken() throws IOException {
            if (incremented) {
                return false;
            }
            term.setLength(0).append(output);
            incremented = true;
            return true;
        }
    };
    return new TokenStreamComponents(tokenizer);
}
private void testToken(String source, String expected) throws IOException {
    Index index = new Index("test", "_na_");
    Settings settings = Settings.builder()
            .put("index.analysis.filter.myStemmer.type", "polish_stem")
            .build();
    TestAnalysis analysis = createTestAnalysis(index, settings, new AnalysisStempelPlugin());
    TokenFilterFactory filterFactory = analysis.tokenFilter.get("myStemmer");

    Tokenizer tokenizer = new KeywordTokenizer();
    tokenizer.setReader(new StringReader(source));
    TokenStream ts = filterFactory.create(tokenizer);

    CharTermAttribute term1 = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    assertThat(ts.incrementToken(), equalTo(true));
    assertThat(term1.toString(), equalTo(expected));
}
@Override
public void inform(ResourceLoader loader) throws IOException {
    final TokenizerFactory factory = tokenizerFactory == null ? null : loadTokenizerFactory(loader, tokenizerFactory);

    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer tokenizer = factory == null
                    ? new WhitespaceTokenizer(Version.LUCENE_CURRENT, reader)
                    : factory.create(reader);
            TokenStream stream = ignoreCase ? new LowerCaseFilter(Version.LUCENE_CURRENT, tokenizer) : tokenizer;
            return new TokenStreamComponents(tokenizer, stream);
        }
    };

    try {
        String formatClass = format;
        if (format == null || format.equals("solr")) {
            formatClass = SolrSynonymParser.class.getName();
        } else if (format.equals("wordnet")) {
            formatClass = WordnetSynonymParser.class.getName();
        }
        // TODO: expose dedup as a parameter?
        map = loadSynonyms(loader, formatClass, true, analyzer);
    } catch (ParseException e) {
        throw new IOException("Error parsing synonyms file:", e);
    }
}
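For context, a hedged sketch of the two synonym file formats this factory dispatches on; the entries below are illustrative (not from the original source), but the syntax follows what SolrSynonymParser and WordnetSynonymParser accept:

// synonyms.txt ("solr" format):
//   couch, sofa, divan          # comma-separated terms are equivalent
//   i-pod, i pod => ipod        # left-hand terms are rewritten to "ipod"
//
// "wordnet" format uses Prolog-style synset entries, e.g.:
//   s(100000001,1,'couch',n,1,0).
//   s(100000001,2,'sofa',n,1,0).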
/**
 * Creates a token stream that tokenizes the given string into token terms
 * (aka words).
 *
 * @param fieldName
 *            the name of the field to tokenize (currently ignored).
 * @param reader
 *            reader (e.g. charfilter) of the original text. can be null.
 * @param text
 *            the string to tokenize
 * @return a new token stream
 */
public TokenStreamComponents createComponents(String fieldName, Reader reader, String text) {
    // Ideally the Analyzer superclass should have a method with the same signature,
    // with a default impl that simply delegates to the StringReader flavour.
    if (reader == null)
        reader = new FastStringReader(text);

    if (pattern == NON_WORD_PATTERN) { // fast path
        return new TokenStreamComponents(new FastStringTokenizer(reader, true, toLowerCase, stopWords));
    } else if (pattern == WHITESPACE_PATTERN) { // fast path
        return new TokenStreamComponents(new FastStringTokenizer(reader, false, toLowerCase, stopWords));
    }

    Tokenizer tokenizer = new PatternTokenizer(reader, pattern, toLowerCase);
    TokenStream result = (stopWords != null) ? new StopFilter(matchVersion, tokenizer, stopWords) : tokenizer;
    return new TokenStreamComponents(tokenizer, result);
}
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer tokenizer = tokenizerFactory.create();
    TokenStream tokenStream = tokenizer;
    for (TokenFilterFactory tokenFilter : tokenFilters) {
        tokenStream = tokenFilter.create(tokenStream);
    }
    return new TokenStreamComponents(tokenizer, tokenStream);
}
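An analyzer assembled from factories like this is consumed through the standard Lucene attribute API. As a point of reference, a minimal hedged sketch of that loop; the helper name, field name, and sample text are made up for illustration:

private static void printTokens(Analyzer analyzer) throws IOException {
    // reset() must precede the first incrementToken(); end() follows the last.
    try (TokenStream ts = analyzer.tokenStream("body", "some sample text")) {
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(term.toString());
        }
        ts.end();
    }
}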
@Test
public void testBasicUsage() throws IOException {
    String query = "한국어를 처리하는 예시입니다ㅋㅋ";
    String[] expectedCharTerms = new String[]{"한국어", "를", " ", "처리", "하다", " ", "예시", "이다", "ㅋㅋ"};
    String[] expectedTypes = new String[]{"Noun", "Josa", "Space", "Noun", "Verb", "Space", "Noun", "Adjective", "KoreanParticle"};
    int[] expectedStartOffsets = new int[]{0, 3, 4, 5, 7, 9, 10, 12, 15};
    int[] expectedEndOffsets = new int[]{3, 4, 5, 7, 9, 10, 12, 15, 17};

    Tokenizer tokenizer = new OpenKoreanTextTokenizer();
    tokenizer.setReader(new StringReader(query));
    OpenKoreanTextTokenFilter tokenFilter = new OpenKoreanTextStemmer(tokenizer);

    TokenStreamAssertions.assertTokenStream(tokenFilter, expectedCharTerms, expectedTypes,
            expectedStartOffsets, expectedEndOffsets);
}
@Test
public void testBasicUsage() throws IOException {
    String query = "한국어를 처리하는 예시입니다ㅋㅋ #한국어";
    String[] expectedCharTerms = new String[]{"한국어", "처리", "처리하는 예시", "예시", "#한국어"};
    String[] expectedTypes = new String[]{"Noun", "Noun", "Noun", "Noun", "Hashtag"};
    int[] expectedStartOffsets = new int[]{0, 5, 5, 10, 18};
    int[] expectedEndOffsets = new int[]{3, 7, 12, 12, 22};

    Tokenizer tokenizer = new OpenKoreanTextTokenizer();
    tokenizer.setReader(new StringReader(query));
    OpenKoreanTextTokenFilter tokenFilter = new OpenKoreanTextPhraseExtractor(tokenizer);

    TokenStreamAssertions.assertTokenStream(tokenFilter, expectedCharTerms, expectedTypes,
            expectedStartOffsets, expectedEndOffsets);
}
@Test
public void testMetaphoneWords() throws Exception {
    Index index = new Index("test", "_na_");
    Settings settings = Settings.builder()
            .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
            .put("index.analysis.filter.myStemmer.type", "br_metaphone")
            .build();
    AnalysisService analysisService = createAnalysisService(index, settings, new AnalysisMetaphonePlugin());
    TokenFilterFactory filterFactory = analysisService.tokenFilter("br_metaphone");

    Tokenizer tokenizer = new KeywordTokenizer();
    Map<String, String> words = buildWordList();
    Set<String> inputWords = words.keySet();
    // The same tokenizer is reused for each word by swapping in a fresh reader.
    for (String word : inputWords) {
        tokenizer.setReader(new StringReader(word));
        TokenStream ts = filterFactory.create(tokenizer);
        CharTermAttribute term1 = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        assertThat(ts.incrementToken(), equalTo(true));
        assertThat(term1.toString(), equalTo(words.get(word)));
        ts.close();
    }
}
@Override
public Tokenizer create() {
    if (reverse) {
        return new ReversePathHierarchyTokenizer(bufferSize, delimiter, replacement, skip);
    }
    return new PathHierarchyTokenizer(bufferSize, delimiter, replacement, skip);
}
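For orientation, a hedged sketch of the token shapes the two variants produce, per Lucene's documented defaults (delimiter '/', skip 0); the sample path is made up:

// PathHierarchyTokenizer emits prefixes:
//   "/a/b/c" -> "/a", "/a/b", "/a/b/c"
// ReversePathHierarchyTokenizer emits suffixes:
//   "/a/b/c" -> "/a/b/c", "a/b/c", "b/c", "c"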
@Override
public Tokenizer create() {
    if (matcher == null) {
        return new EdgeNGramTokenizer(minGram, maxGram);
    } else {
        return new EdgeNGramTokenizer(minGram, maxGram) {
            @Override
            protected boolean isTokenChar(int chr) {
                return matcher.isTokenChar(chr);
            }
        };
    }
}
@Override
public Tokenizer create() {
    if (matcher == null) {
        return new NGramTokenizer(minGram, maxGram);
    } else {
        return new NGramTokenizer(minGram, maxGram) {
            @Override
            protected boolean isTokenChar(int chr) {
                return matcher.isTokenChar(chr);
            }
        };
    }
}
private static TokenStream createStackedTokenStream(String source, CharFilterFactory[] charFilterFactories,
        TokenizerFactory tokenizerFactory, TokenFilterFactory[] tokenFilterFactories, int current) {
    Reader reader = new FastStringReader(source);
    for (CharFilterFactory charFilterFactory : charFilterFactories) {
        reader = charFilterFactory.create(reader);
    }
    Tokenizer tokenizer = tokenizerFactory.create();
    tokenizer.setReader(reader);
    // Chain only the first 'current' token filters, so callers can inspect
    // the stream as it looks after each stage of the analysis chain.
    TokenStream tokenStream = tokenizer;
    for (int i = 0; i < current; i++) {
        tokenStream = tokenFilterFactories[i].create(tokenStream);
    }
    return tokenStream;
}
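A hedged sketch of how a stacked-stream helper like this is typically driven from explain-style code; the loop below is illustrative, not the original caller:

// Build one stream per stage: i = 0 shows the bare tokenizer output,
// i = n shows the output after the first n token filters.
for (int i = 0; i <= tokenFilterFactories.length; i++) {
    TokenStream stream = createStackedTokenStream(source, charFilterFactories,
            tokenizerFactory, tokenFilterFactories, i);
    // consume 'stream' with the usual reset()/incrementToken()/end()/close() loop
}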
private void assertTokenFilter(String name, Class<?> clazz) throws IOException {
    Settings settings = Settings.builder()
            .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build();
    TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get(name);
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("foo bar"));
    TokenStream stream = tokenFilter.create(tokenizer);
    assertThat(stream, instanceOf(clazz));
}
public void testPorter2FilterFactory() throws IOException {
    int iters = scaledRandomIntBetween(20, 100);
    for (int i = 0; i < iters; i++) {
        Version v = VersionUtils.randomVersion(random());
        Settings settings = Settings.builder()
                .put("index.analysis.filter.my_porter2.type", "stemmer")
                .put("index.analysis.filter.my_porter2.language", "porter2")
                .put("index.analysis.analyzer.my_porter2.tokenizer", "whitespace")
                .put("index.analysis.analyzer.my_porter2.filter", "my_porter2")
                .put(SETTING_VERSION_CREATED, v)
                .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
                .build();

        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
        TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_porter2");
        assertThat(tokenFilter, instanceOf(StemmerTokenFilterFactory.class));

        Tokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("foo bar"));
        TokenStream create = tokenFilter.create(tokenizer);
        IndexAnalyzers indexAnalyzers = analysis.indexAnalyzers;
        NamedAnalyzer analyzer = indexAnalyzers.get("my_porter2");
        assertThat(create, instanceOf(SnowballFilter.class));
        assertAnalyzesTo(analyzer, "possibly", new String[]{"possibl"});
    }
}
public void testDefault() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .put("index.analysis.filter.my_ascii_folding.type", "asciifolding")
            .build());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_ascii_folding");
    String source = "Ansprüche";
    String[] expected = new String[]{"Anspruche"};
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
public void testNoTokenChars() throws IOException {
    final Index index = new Index("test", "_na_");
    final String name = "ngr";
    final Settings indexSettings = newAnalysisSettingsBuilder().build();
    final Settings settings = newAnalysisSettingsBuilder()
            .put("min_gram", 2).put("max_gram", 4)
            .putArray("token_chars", new String[0]).build();
    Tokenizer tokenizer = new NGramTokenizerFactory(
            IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings).create();
    tokenizer.setReader(new StringReader("1.34"));
    assertTokenStreamContents(tokenizer, new String[] {"1.", "1.3", "1.34", ".3", ".34", "34"});
}
public void testCaseInsensitiveMapping() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_keep_filter");
    assertThat(tokenFilter, instanceOf(KeepWordFilterFactory.class));
    String source = "hello small world";
    String[] expected = new String[]{"hello", "world"};
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected, new int[]{1, 2});
}
public void testCaseSensitiveMapping() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_case_sensitive_keep_filter");
    assertThat(tokenFilter, instanceOf(KeepWordFilterFactory.class));
    String source = "Hello small world";
    String[] expected = new String[]{"Hello"};
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected, new int[]{1});
}
/** Correct offset order when doing both parts and concatenation: PowerShot is a synonym of Power */
public void testPartsAndCatenate() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .put("index.analysis.filter.my_word_delimiter.type", type)
            .put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
            .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true")
            .build());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
    String source = "PowerShot";
    String[] expected = new String[]{"Power", "PowerShot", "Shot"};
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
public void testDefault() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .put("index.analysis.filter.my_word_delimiter.type", type)
            .build());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
    String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
    String[] expected = new String[]{"Power", "Shot", "500", "42", "wi", "fi", "wi", "fi",
            "4000", "j", "2", "se", "O", "Neil"};
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
public void testCatenateWords() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .put("index.analysis.filter.my_word_delimiter.type", type)
            .put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
            .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "false")
            .build());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
    String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
    String[] expected = new String[]{"PowerShot", "500", "42", "wifi", "wifi", "4000", "j", "2", "se", "ONeil"};
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
public void testCatenateNumbers() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .put("index.analysis.filter.my_word_delimiter.type", type)
            .put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false")
            .put("index.analysis.filter.my_word_delimiter.catenate_numbers", "true")
            .build());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
    String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
    String[] expected = new String[]{"Power", "Shot", "50042", "wi", "fi", "wi", "fi",
            "4000", "j", "2", "se", "O", "Neil"};
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
public void testCatenateAll() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .put("index.analysis.filter.my_word_delimiter.type", type)
            .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "false")
            .put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false")
            .put("index.analysis.filter.my_word_delimiter.catenate_all", "true")
            .build());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
    String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
    String[] expected = new String[]{"PowerShot", "50042", "wifi", "wifi4000", "j2se", "ONeil"};
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
public void testSplitOnCaseChange() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .put("index.analysis.filter.my_word_delimiter.type", type)
            .put("index.analysis.filter.my_word_delimiter.split_on_case_change", "false")
            .build());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
    String source = "PowerShot";
    String[] expected = new String[]{"PowerShot"};
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
public void testPreserveOriginal() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .put("index.analysis.filter.my_word_delimiter.type", type)
            .put("index.analysis.filter.my_word_delimiter.preserve_original", "true")
            .build());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
    String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
    String[] expected = new String[]{"PowerShot", "Power", "Shot", "500-42", "500", "42",
            "wi-fi", "wi", "fi", "wi-fi-4000", "wi", "fi", "4000", "j2se", "j", "2", "se",
            "O'Neil's", "O", "Neil"};
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
public void testStemEnglishPossessive() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .put("index.analysis.filter.my_word_delimiter.type", type)
            .put("index.analysis.filter.my_word_delimiter.stem_english_possessive", "false")
            .build());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
    String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
    String[] expected = new String[]{"Power", "Shot", "500", "42", "wi", "fi", "wi", "fi",
            "4000", "j", "2", "se", "O", "Neil", "s"};
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}