private void testToken(String source, String expected) throws IOException {
    Index index = new Index("test", "_na_");
    Settings settings = Settings.builder()
            .put("index.analysis.filter.myStemmer.type", "polish_stem")
            .build();
    TestAnalysis analysis = createTestAnalysis(index, settings, new AnalysisStempelPlugin());
    TokenFilterFactory filterFactory = analysis.tokenFilter.get("myStemmer");

    Tokenizer tokenizer = new KeywordTokenizer();
    tokenizer.setReader(new StringReader(source));
    TokenStream ts = filterFactory.create(tokenizer);

    CharTermAttribute term1 = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    assertThat(ts.incrementToken(), equalTo(true));
    assertThat(term1.toString(), equalTo(expected));
}
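// Hypothetical usage of the helper above (a sketch, not from the original
// suite): the input/expected pair is illustrative and has NOT been verified
// against the Stempel polish_stem filter's actual output.
public void testBasicUsage() throws Exception {
    testToken("studenci", "student"); // assumed stemmer output, for illustration only
}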
public void testIgnoreWhitespace() throws Exception {
    String withSpace = "foo bar";
    String withoutSpace = "foobar";
    String withPunctuation = "foo-bar";
    TokenFilterFactory factory = tokenFilterFactory("ICUCollationKey",
            "locale", "en",
            "strength", "primary",
            "alternate", "shifted",
            "variableTop", " ");
    TokenStream tsWithSpace = factory.create(
            new KeywordTokenizer(new StringReader(withSpace)));
    TokenStream tsWithoutSpace = factory.create(
            new KeywordTokenizer(new StringReader(withoutSpace)));
    assertCollatesToSame(tsWithSpace, tsWithoutSpace);
    // now assert that punctuation still matters: foo-bar < foo bar
    tsWithSpace = factory.create(
            new KeywordTokenizer(new StringReader(withSpace)));
    TokenStream tsWithPunctuation = factory.create(
            new KeywordTokenizer(new StringReader(withPunctuation)));
    assertCollation(tsWithPunctuation, tsWithSpace, -1);
}
public void testEmptyTerm() throws IOException {
    Random random = random();
    final int numIters = atLeast(10);
    for (int i = 0; i < numIters; i++) {
        b = new SynonymMap.Builder(random.nextBoolean());
        final int numEntries = atLeast(10);
        for (int j = 0; j < numEntries; j++) {
            add(randomNonEmptyString(), randomNonEmptyString(), random.nextBoolean());
        }
        final SynonymMap map = b.build();
        final boolean ignoreCase = random.nextBoolean();
        final Analyzer analyzer = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
                Tokenizer tokenizer = new KeywordTokenizer(reader);
                return new TokenStreamComponents(tokenizer,
                        new SynonymFilter(tokenizer, map, ignoreCase));
            }
        };
        checkAnalysisConsistency(random, analyzer, random.nextBoolean(), "");
    }
}
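// Sketch of the helpers assumed by the test above, modeled on Lucene's synonym
// tests (not part of the original snippet): `b` is the shared builder, and
// add(...) encodes multi-word rules using the U+0000 word separator that
// SynonymMap expects.
private SynonymMap.Builder b;

private void add(String input, String output, boolean keepOrig) {
    b.add(new CharsRef(input.replaceAll(" +", "\u0000")),
          new CharsRef(output.replaceAll(" +", "\u0000")),
          keepOrig);
}

private String randomNonEmptyString() {
    while (true) {
        // reject empty strings and strings containing the word separator
        final String s = TestUtil.randomUnicodeString(random()).trim();
        if (s.length() != 0 && s.indexOf('\u0000') == -1) {
            return s;
        }
    }
}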
public void testSupplementaryCharacters() throws IOException {
    final String s = TestUtil.randomUnicodeString(random(), 10);
    final int codePointCount = s.codePointCount(0, s.length());
    final int minGram = TestUtil.nextInt(random(), 1, 3);
    final int maxGram = TestUtil.nextInt(random(), minGram, 10);
    TokenStream tk = new KeywordTokenizer(new StringReader(s));
    tk = new EdgeNGramTokenFilter(tk, minGram, maxGram);
    final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
    final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
    tk.reset();
    for (int i = minGram; i <= Math.min(codePointCount, maxGram); ++i) {
        assertTrue(tk.incrementToken());
        assertEquals(0, offsetAtt.startOffset());
        assertEquals(s.length(), offsetAtt.endOffset());
        final int end = Character.offsetByCodePoints(s, 0, i);
        assertEquals(s.substring(0, end), termAtt.toString());
    }
    assertFalse(tk.incrementToken());
}
public void testSupplementaryCharacters() throws IOException {
    final String s = TestUtil.randomUnicodeString(random(), 10);
    final int codePointCount = s.codePointCount(0, s.length());
    final int minGram = TestUtil.nextInt(random(), 1, 3);
    final int maxGram = TestUtil.nextInt(random(), minGram, 10);
    TokenStream tk = new KeywordTokenizer(new StringReader(s));
    tk = new NGramTokenFilter(tk, minGram, maxGram);
    final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
    final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
    tk.reset();
    for (int start = 0; start < codePointCount; ++start) {
        for (int end = start + minGram; end <= Math.min(codePointCount, start + maxGram); ++end) {
            assertTrue(tk.incrementToken());
            assertEquals(0, offsetAtt.startOffset());
            assertEquals(s.length(), offsetAtt.endOffset());
            final int startIndex = Character.offsetByCodePoints(s, 0, start);
            final int endIndex = Character.offsetByCodePoints(s, 0, end);
            assertEquals(s.substring(startIndex, endIndex), termAtt.toString());
        }
    }
    assertFalse(tk.incrementToken());
}
public void testEmptyTerm() throws IOException {
    Random random = random();
    for (int i = 0; i < 512; i++) {
        final int flags = i;
        final CharArraySet protectedWords;
        if (random.nextBoolean()) {
            protectedWords = new CharArraySet(new HashSet<>(Arrays.asList("a", "b", "cd")), false);
        } else {
            protectedWords = null;
        }
        Analyzer a = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
                Tokenizer tokenizer = new KeywordTokenizer(reader);
                return new TokenStreamComponents(tokenizer,
                        new Lucene47WordDelimiterFilter(tokenizer, flags, protectedWords));
            }
        };
        // depending upon options, this thing may or may not preserve the empty term
        checkAnalysisConsistency(random, a, random.nextBoolean(), "");
    }
}
public void testEmptyTerm() throws IOException {
    Random random = random();
    for (int i = 0; i < 512; i++) {
        final int flags = i;
        final CharArraySet protectedWords;
        if (random.nextBoolean()) {
            protectedWords = new CharArraySet(new HashSet<>(Arrays.asList("a", "b", "cd")), false);
        } else {
            protectedWords = null;
        }
        Analyzer a = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
                Tokenizer tokenizer = new KeywordTokenizer(reader);
                return new TokenStreamComponents(tokenizer,
                        new WordDelimiterFilter(tokenizer, flags, protectedWords));
            }
        };
        // depending upon options, this thing may or may not preserve the empty term
        checkAnalysisConsistency(random, a, random.nextBoolean(), "");
    }
}
public void testRandomStrings() throws IOException {
    for (int i = 0; i < 10000; i++) {
        String text = TestUtil.randomUnicodeString(random(), 100);
        int min = TestUtil.nextInt(random(), 0, 100);
        int max = TestUtil.nextInt(random(), 0, 100);
        int count = text.codePointCount(0, text.length());
        if (min > max) {
            int temp = min;
            min = max;
            max = temp;
        }
        boolean expected = count >= min && count <= max;
        TokenStream stream = new KeywordTokenizer(new StringReader(text));
        stream = new CodepointCountFilter(stream, min, max);
        stream.reset();
        assertEquals(expected, stream.incrementToken());
        stream.end();
        stream.close();
    }
}
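// A deterministic companion to the randomized test above (a sketch, not part
// of the original suite): "a\uD801\uDC00b" is three code points (the surrogate
// pair counts once), so it should survive a [2,3] code-point window.
public void testKeepsTermInCodepointRange() throws IOException {
    TokenStream stream = new KeywordTokenizer(new StringReader("a\uD801\uDC00b"));
    stream = new CodepointCountFilter(stream, 2, 3);
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    assertTrue(stream.incrementToken());
    assertEquals("a\uD801\uDC00b", termAtt.toString());
    assertFalse(stream.incrementToken());
    stream.end();
    stream.close();
}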
/**
 * For the supplied language, run the stemmer against all strings in voc.txt.
 * The output should be the same as the string in output.txt.
 */
private void assertCorrectOutput(final String snowballLanguage, String dataDirectory) throws IOException {
    if (VERBOSE) {
        System.out.println("checking snowball language: " + snowballLanguage);
    }
    Analyzer a = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer t = new KeywordTokenizer(reader);
            return new TokenStreamComponents(t, new SnowballFilter(t, snowballLanguage));
        }
    };
    assertVocabulary(a, getDataFile("TestSnowballVocabData.zip"),
            dataDirectory + "/voc.txt", dataDirectory + "/output.txt");
}
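// Example invocation of the helper above (a sketch): the language name and the
// directory inside TestSnowballVocabData.zip are assumed, not taken from the
// original snippet.
public void testEnglishVocabulary() throws IOException {
    assertCorrectOutput("English", "english");
}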
public void testNormalization() throws IOException {
    String turkishUpperCase = "I W\u0049\u0307LL USE TURKİSH CASING";
    String turkishLowerCase = "ı will use turkish casıng";
    ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
    Map<String,String> args = new HashMap<String,String>();
    args.put("locale", "tr");
    args.put("strength", "primary");
    args.put("decomposition", "canonical");
    factory.init(args);
    factory.inform(new StringMockResourceLoader(""));
    TokenStream tsUpper = factory.create(
            new KeywordTokenizer(new StringReader(turkishUpperCase)));
    TokenStream tsLower = factory.create(
            new KeywordTokenizer(new StringReader(turkishLowerCase)));
    assertCollatesToSame(tsUpper, tsLower);
}
public void testSecondaryStrength() throws IOException {
    String upperCase = "TESTING";
    String lowerCase = "testing";
    ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
    Map<String,String> args = new HashMap<String,String>();
    args.put("locale", "en");
    args.put("strength", "secondary");
    args.put("decomposition", "no");
    factory.init(args);
    factory.inform(new StringMockResourceLoader(""));
    TokenStream tsUpper = factory.create(
            new KeywordTokenizer(new StringReader(upperCase)));
    TokenStream tsLower = factory.create(
            new KeywordTokenizer(new StringReader(lowerCase)));
    assertCollatesToSame(tsUpper, tsLower);
}
public void testIgnorePunctuation() throws IOException {
    String withPunctuation = "foo-bar";
    String withoutPunctuation = "foo bar";
    ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
    Map<String,String> args = new HashMap<String,String>();
    args.put("locale", "en");
    args.put("strength", "primary");
    args.put("alternate", "shifted");
    factory.init(args);
    factory.inform(new StringMockResourceLoader(""));
    TokenStream tsPunctuation = factory.create(
            new KeywordTokenizer(new StringReader(withPunctuation)));
    TokenStream tsWithoutPunctuation = factory.create(
            new KeywordTokenizer(new StringReader(withoutPunctuation)));
    assertCollatesToSame(tsPunctuation, tsWithoutPunctuation);
}
public void testIgnoreWhitespace() throws IOException {
    String withSpace = "foo bar";
    String withoutSpace = "foobar";
    String withPunctuation = "foo-bar";
    ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
    Map<String,String> args = new HashMap<String,String>();
    args.put("locale", "en");
    args.put("strength", "primary");
    args.put("alternate", "shifted");
    args.put("variableTop", " ");
    factory.init(args);
    factory.inform(new StringMockResourceLoader(""));
    TokenStream tsWithSpace = factory.create(
            new KeywordTokenizer(new StringReader(withSpace)));
    TokenStream tsWithoutSpace = factory.create(
            new KeywordTokenizer(new StringReader(withoutSpace)));
    assertCollatesToSame(tsWithSpace, tsWithoutSpace);
    // now assert that punctuation still matters: foo-bar < foo bar
    tsWithSpace = factory.create(
            new KeywordTokenizer(new StringReader(withSpace)));
    TokenStream tsWithPunctuation = factory.create(
            new KeywordTokenizer(new StringReader(withPunctuation)));
    assertCollation(tsWithPunctuation, tsWithSpace, -1);
}
public void testUpperCaseFirst() throws IOException {
    String lower = "resume";
    String upper = "Resume";
    ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
    Map<String,String> args = new HashMap<String,String>();
    args.put("locale", "en");
    args.put("strength", "tertiary");
    args.put("caseFirst", "upper");
    factory.init(args);
    factory.inform(new StringMockResourceLoader(""));
    TokenStream tsLower = factory.create(
            new KeywordTokenizer(new StringReader(lower)));
    TokenStream tsUpper = factory.create(
            new KeywordTokenizer(new StringReader(upper)));
    assertCollation(tsUpper, tsLower, -1);
}
public void testEmptyTerm() throws IOException {
    Random random = random();
    for (int i = 0; i < 512; i++) {
        final int flags = i;
        final CharArraySet protectedWords;
        if (random.nextBoolean()) {
            protectedWords = new CharArraySet(TEST_VERSION_CURRENT,
                    new HashSet<String>(Arrays.asList("a", "b", "cd")), false);
        } else {
            protectedWords = null;
        }
        Analyzer a = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
                Tokenizer tokenizer = new KeywordTokenizer(reader);
                return new TokenStreamComponents(tokenizer,
                        new WordDelimiterFilter(tokenizer, flags, protectedWords));
            }
        };
        // depending upon options, this thing may or may not preserve the empty term
        checkAnalysisConsistency(random, a, random.nextBoolean(), "");
    }
}
public void testEmptyTerm() throws IOException {
    Random random = random();
    for (int i = 0; i < 1024; i++) {
        final int flags = i;
        final CharArraySet protectedWords;
        if (random.nextBoolean()) {
            protectedWords = new CharArraySet(TEST_VERSION_CURRENT,
                    new HashSet<String>(Arrays.asList("a", "b", "cd")), false);
        } else {
            protectedWords = null;
        }
        Analyzer a = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
                Tokenizer tokenizer = new KeywordTokenizer(reader);
                return new TokenStreamComponents(tokenizer,
                        new WordDelimiterFilter2(tokenizer, flags, protectedWords));
            }
        };
        // depending upon options, this thing may or may not preserve the empty term
        checkAnalysisConsistency(random, a, random.nextBoolean(), "");
    }
}
public void testSupplementaryCharacters() throws IOException {
    final String s = _TestUtil.randomUnicodeString(random(), 10);
    final int codePointCount = s.codePointCount(0, s.length());
    final int minGram = _TestUtil.nextInt(random(), 1, 3);
    final int maxGram = _TestUtil.nextInt(random(), minGram, 10);
    TokenStream tk = new KeywordTokenizer(new StringReader(s));
    tk = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tk, minGram, maxGram);
    final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
    final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
    tk.reset();
    for (int i = minGram; i <= Math.min(codePointCount, maxGram); ++i) {
        assertTrue(tk.incrementToken());
        assertEquals(0, offsetAtt.startOffset());
        assertEquals(s.length(), offsetAtt.endOffset());
        final int end = Character.offsetByCodePoints(s, 0, i);
        assertEquals(s.substring(0, end), termAtt.toString());
    }
    assertFalse(tk.incrementToken());
}
public void testSupplementaryCharacters() throws IOException {
    final String s = _TestUtil.randomUnicodeString(random(), 10);
    final int codePointCount = s.codePointCount(0, s.length());
    final int minGram = _TestUtil.nextInt(random(), 1, 3);
    final int maxGram = _TestUtil.nextInt(random(), minGram, 10);
    TokenStream tk = new KeywordTokenizer(new StringReader(s));
    tk = new NGramTokenFilter(TEST_VERSION_CURRENT, tk, minGram, maxGram);
    final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
    final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
    tk.reset();
    for (int start = 0; start < codePointCount; ++start) {
        for (int end = start + minGram; end <= Math.min(codePointCount, start + maxGram); ++end) {
            assertTrue(tk.incrementToken());
            assertEquals(0, offsetAtt.startOffset());
            assertEquals(s.length(), offsetAtt.endOffset());
            final int startIndex = Character.offsetByCodePoints(s, 0, start);
            final int endIndex = Character.offsetByCodePoints(s, 0, end);
            assertEquals(s.substring(startIndex, endIndex), termAtt.toString());
        }
    }
    assertFalse(tk.incrementToken());
}
@Test
public void testMetaphoneWords() throws Exception {
    Index index = new Index("test", "_na_");
    Settings settings = Settings.builder()
            .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
            .put("index.analysis.filter.myStemmer.type", "br_metaphone")
            .build();
    AnalysisService analysisService = createAnalysisService(index, settings, new AnalysisMetaphonePlugin());
    TokenFilterFactory filterFactory = analysisService.tokenFilter("br_metaphone");

    Tokenizer tokenizer = new KeywordTokenizer();

    Map<String,String> words = buildWordList();
    Set<String> inputWords = words.keySet();
    for (String word : inputWords) {
        tokenizer.setReader(new StringReader(word));
        TokenStream ts = filterFactory.create(tokenizer);

        CharTermAttribute term1 = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        assertThat(ts.incrementToken(), equalTo(true));
        assertThat(term1.toString(), equalTo(words.get(word)));
        ts.close();
    }
}
@Inject
public EudexTokenizerFactory(Index index, IndexSettingsService indexSettingsService,
                             @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.indexSettings(), name, settings);
    this.factory = new EudexAttributeFactory();
    this.bufferSize = settings.getAsInt("buffersize", KeywordTokenizer.DEFAULT_BUFFER_SIZE);
}
@Inject
public EudexAnalyzerProvider(Index index, IndexSettingsService indexSettingsService,
                             @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.indexSettings(), name, settings);
    this.bufferSize = settings.getAsInt("buffersize", KeywordTokenizer.DEFAULT_BUFFER_SIZE);
}
@Inject
public IcuCollationTokenizerFactory(Index index, @IndexSettings Settings indexSettings,
                                    @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettings, name, settings);
    Collator collator = IcuCollationKeyAnalyzerProvider.createCollator(settings);
    this.factory = new ICUCollationAttributeFactory(collator);
    this.bufferSize = settings.getAsInt("buffer_size", KeywordTokenizer.DEFAULT_BUFFER_SIZE);
}
public void testEmptyTerm() throws IOException {
    Random random = random();
    Analyzer a = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer tokenizer = new KeywordTokenizer(reader);
            return new TokenStreamComponents(tokenizer, new WordTokenFilter(tokenizer));
        }
    };
    checkAnalysisConsistency(random, a, random.nextBoolean(), "");
}
public void testEmptyTerm() throws IOException {
    Analyzer a = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer tokenizer = new KeywordTokenizer(reader);
            return new TokenStreamComponents(tokenizer, new JapaneseBaseFormFilter(tokenizer));
        }
    };
    checkOneTerm(a, "", "");
}
public void testEmptyTerm() throws IOException {
    Analyzer a = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer tokenizer = new KeywordTokenizer(reader);
            return new TokenStreamComponents(tokenizer, new JapaneseReadingFormFilter(tokenizer));
        }
    };
    checkOneTerm(a, "", "");
}
public void testEmptyTerm() throws IOException {
    Analyzer a = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer tokenizer = new KeywordTokenizer(reader);
            return new TokenStreamComponents(tokenizer, new JapaneseKatakanaStemFilter(tokenizer));
        }
    };
    checkOneTerm(a, "", "");
}
public void testEmptyTerm() throws IOException {
    Analyzer a = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer tokenizer = new KeywordTokenizer(reader);
            return new TokenStreamComponents(tokenizer, new ICUNormalizer2Filter(tokenizer));
        }
    };
    checkOneTerm(a, "", "");
}
public void testOptimizer() throws Exception {
    String rules = "a > b; b > c;"; // convert a's to b's and b's to c's
    Transliterator custom = Transliterator.createFromRules("test", rules, Transliterator.FORWARD);
    assertTrue(custom.getFilter() == null);
    new ICUTransformFilter(new KeywordTokenizer(new StringReader("")), custom);
    assertTrue(custom.getFilter().equals(new UnicodeSet("[ab]")));
}
public void testOptimizerSurrogate() throws Exception {
    String rules = "\\U00020087 > x;"; // convert CJK UNIFIED IDEOGRAPH-20087 to an x
    Transliterator custom = Transliterator.createFromRules("test", rules, Transliterator.FORWARD);
    assertTrue(custom.getFilter() == null);
    new ICUTransformFilter(new KeywordTokenizer(new StringReader("")), custom);
    assertTrue(custom.getFilter().equals(new UnicodeSet("[\\U00020087]")));
}
public void testEmptyTerm() throws IOException {
    Analyzer a = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer tokenizer = new KeywordTokenizer(reader);
            return new TokenStreamComponents(tokenizer,
                    new ICUTransformFilter(tokenizer, Transliterator.getInstance("Any-Latin")));
        }
    };
    checkOneTerm(a, "", "");
}
public void testEmptyTerm() throws IOException {
    Analyzer a = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer tokenizer = new KeywordTokenizer(reader);
            return new TokenStreamComponents(tokenizer, new ICUFoldingFilter(tokenizer));
        }
    };
    checkOneTerm(a, "", "");
}
public void testBasicUsage() throws Exception {
    String turkishUpperCase = "I WİLL USE TURKİSH CASING";
    String turkishLowerCase = "ı will use turkish casıng";
    TokenFilterFactory factory = tokenFilterFactory("ICUCollationKey",
            "locale", "tr",
            "strength", "primary");
    TokenStream tsUpper = factory.create(
            new KeywordTokenizer(new StringReader(turkishUpperCase)));
    TokenStream tsLower = factory.create(
            new KeywordTokenizer(new StringReader(turkishLowerCase)));
    assertCollatesToSame(tsUpper, tsLower);
}
public void testNormalization() throws Exception {
    String turkishUpperCase = "I W\u0049\u0307LL USE TURKİSH CASING";
    String turkishLowerCase = "ı will use turkish casıng";
    TokenFilterFactory factory = tokenFilterFactory("ICUCollationKey",
            "locale", "tr",
            "strength", "primary",
            "decomposition", "canonical");
    TokenStream tsUpper = factory.create(
            new KeywordTokenizer(new StringReader(turkishUpperCase)));
    TokenStream tsLower = factory.create(
            new KeywordTokenizer(new StringReader(turkishLowerCase)));
    assertCollatesToSame(tsUpper, tsLower);
}
public void testSecondaryStrength() throws Exception {
    String upperCase = "TESTING";
    String lowerCase = "testing";
    TokenFilterFactory factory = tokenFilterFactory("ICUCollationKey",
            "locale", "en",
            "strength", "secondary",
            "decomposition", "no");
    TokenStream tsUpper = factory.create(
            new KeywordTokenizer(new StringReader(upperCase)));
    TokenStream tsLower = factory.create(
            new KeywordTokenizer(new StringReader(lowerCase)));
    assertCollatesToSame(tsUpper, tsLower);
}
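// Sketch of the collation assertion helpers referenced throughout the tests
// above, modeled on Lucene's collation test utilities (assumed, not taken from
// the original source): each stream must yield exactly one collation-key term,
// and the sign of the terms' comparison must match the expected sign.
private void assertCollatesToSame(TokenStream stream1, TokenStream stream2) throws IOException {
    assertCollation(stream1, stream2, 0);
}

private void assertCollation(TokenStream stream1, TokenStream stream2, int comparison) throws IOException {
    CharTermAttribute term1 = stream1.addAttribute(CharTermAttribute.class);
    CharTermAttribute term2 = stream2.addAttribute(CharTermAttribute.class);
    stream1.reset();
    stream2.reset();
    assertTrue(stream1.incrementToken());
    assertTrue(stream2.incrementToken());
    assertEquals(Integer.signum(comparison),
            Integer.signum(term1.toString().compareTo(term2.toString())));
    assertFalse(stream1.incrementToken());
    assertFalse(stream2.incrementToken());
    stream1.end();
    stream2.end();
    stream1.close();
    stream2.close();
}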