@Override
public void inform(ResourceLoader loader) throws IOException {
  final TokenizerFactory factory = tokenizerFactory == null ? null : loadTokenizerFactory(loader, tokenizerFactory);

  // Build the analyzer used to tokenize entries in the synonyms file, then load the map.
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer tokenizer = factory == null ? new WhitespaceTokenizer(Version.LUCENE_CURRENT, reader) : factory.create(reader);
      TokenStream stream = ignoreCase ? new LowerCaseFilter(Version.LUCENE_CURRENT, tokenizer) : tokenizer;
      return new TokenStreamComponents(tokenizer, stream);
    }
  };

  try {
    String formatClass = format;
    if (format == null || format.equals("solr")) {
      formatClass = SolrSynonymParser.class.getName();
    } else if (format.equals("wordnet")) {
      formatClass = WordnetSynonymParser.class.getName();
    }
    // TODO: expose dedup as a parameter?
    map = loadSynonyms(loader, formatClass, true, analyzer);
  } catch (ParseException e) {
    throw new IOException("Error parsing synonyms file:", e);
  }
}
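// Hedged sketch (not part of the original source): exercising the "solr" format
// parser selected above, directly. Assumes a Lucene version where
// SolrSynonymParser exposes parse(Reader) and build(); the analyzer and rules
// here are illustrative only.
//
//   import java.io.StringReader;
//   import org.apache.lucene.analysis.synonym.SolrSynonymParser;
//   import org.apache.lucene.analysis.synonym.SynonymMap;
private SynonymMap parseSolrRules(Analyzer analyzer) throws Exception {
  SolrSynonymParser parser = new SolrSynonymParser(true, true, analyzer); // dedup, expand
  parser.parse(new StringReader("i-pod, ipod\nfoo => bar"));              // solr-format rules
  return parser.build(); // the FST-backed SynonymMap assigned to 'map' above
}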
/** concat numbers + words + all */
public void testLotsOfConcatenating() throws Exception {
  final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS | CATENATE_NUMBERS
      | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;

  /* analyzer that uses whitespace + wdf */
  Analyzer a = new Analyzer() {
    @Override
    public TokenStreamComponents createComponents(String field, Reader reader) {
      Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
      return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, null));
    }
  };

  assertAnalyzesTo(a, "abc-def-123-456",
      new String[] { "abc", "abcdef", "abcdef123456", "def", "123", "123456", "456" },
      new int[] { 0, 0, 0, 4, 8, 8, 12 },
      new int[] { 3, 7, 15, 7, 11, 15, 15 },
      new int[] { 1, 0, 0, 1, 1, 0, 1 });
}
/** concat numbers + words + all + preserve original */
public void testLotsOfConcatenating2() throws Exception {
  final int flags = PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS
      | CATENATE_NUMBERS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;

  /* analyzer that uses whitespace + wdf */
  Analyzer a = new Analyzer() {
    @Override
    public TokenStreamComponents createComponents(String field, Reader reader) {
      Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
      return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, null));
    }
  };

  assertAnalyzesTo(a, "abc-def-123-456",
      new String[] { "abc-def-123-456", "abc", "abcdef", "abcdef123456", "def", "123", "123456", "456" },
      new int[] { 0, 0, 0, 0, 4, 8, 8, 12 },
      new int[] { 15, 3, 7, 15, 7, 11, 15, 15 },
      new int[] { 1, 0, 0, 0, 1, 1, 0, 1 });
}
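// Hedged sketch (not from the source): consuming the WordDelimiterFilter chain
// from the tests above with the plain TokenStream API instead of the
// assertAnalyzesTo harness. 'a' is the analyzer built in the preceding test.
//
//   import java.io.StringReader;
//   import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
try (TokenStream ts = a.tokenStream("field", new StringReader("abc-def-123-456"))) {
  CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
  ts.reset();
  while (ts.incrementToken()) {
    System.out.println(term); // abc, abcdef, abcdef123456, def, 123, 123456, 456
  }
  ts.end();
}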
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
  int numIterations = atLeast(5);
  for (int i = 0; i < numIterations; i++) {
    final int flags = random().nextInt(512);
    final CharArraySet protectedWords;
    if (random().nextBoolean()) {
      protectedWords = new CharArraySet(new HashSet<>(Arrays.asList("a", "b", "cd")), false);
    } else {
      protectedWords = null;
    }

    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, protectedWords));
      }
    };
    // TODO: properly support positionLengthAttribute
    checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER, 20, false, false);
  }
}
/** blast some enormous random strings through the analyzer */
public void testRandomHugeStrings() throws Exception {
  int numIterations = atLeast(5);
  for (int i = 0; i < numIterations; i++) {
    final int flags = random().nextInt(512);
    final CharArraySet protectedWords;
    if (random().nextBoolean()) {
      protectedWords = new CharArraySet(new HashSet<>(Arrays.asList("a", "b", "cd")), false);
    } else {
      protectedWords = null;
    }

    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, protectedWords));
      }
    };
    // TODO: properly support positionLengthAttribute
    checkRandomData(random(), a, 100*RANDOM_MULTIPLIER, 8192, false, false);
  }
}
public void testEmptyTerm() throws IOException {
  Random random = random();
  for (int i = 0; i < 512; i++) {
    final int flags = i;
    final CharArraySet protectedWords;
    if (random.nextBoolean()) {
      protectedWords = new CharArraySet(new HashSet<>(Arrays.asList("a", "b", "cd")), false);
    } else {
      protectedWords = null;
    }

    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new KeywordTokenizer(reader);
        return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, protectedWords));
      }
    };
    // depending upon options, this thing may or may not preserve the empty term
    checkAnalysisConsistency(random, a, random.nextBoolean(), "");
  }
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
  int numIterations = atLeast(5);
  for (int i = 0; i < numIterations; i++) {
    final int flags = random().nextInt(512);
    final CharArraySet protectedWords;
    if (random().nextBoolean()) {
      protectedWords = new CharArraySet(TEST_VERSION_CURRENT, new HashSet<String>(Arrays.asList("a", "b", "cd")), false);
    } else {
      protectedWords = null;
    }

    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, protectedWords));
      }
    };
    checkRandomData(random(), a, 200, 20, false, false);
  }
}
public void testEmptyTerm() throws IOException {
  Random random = random();
  for (int i = 0; i < 512; i++) {
    final int flags = i;
    final CharArraySet protectedWords;
    if (random.nextBoolean()) {
      protectedWords = new CharArraySet(TEST_VERSION_CURRENT, new HashSet<String>(Arrays.asList("a", "b", "cd")), false);
    } else {
      protectedWords = null;
    }

    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new KeywordTokenizer(reader);
        return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, protectedWords));
      }
    };
    // depending upon options, this thing may or may not preserve the empty term
    checkAnalysisConsistency(random, a, random.nextBoolean(), "");
  }
}
@Override
public void inform(ResourceLoader loader) throws IOException {
  final TokenizerFactory factory = tokenizerFactory == null ? null : loadTokenizerFactory(loader, tokenizerFactory);

  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer tokenizer = factory == null ? new WhitespaceTokenizer(Version.LUCENE_43, reader) : factory.create(reader);
      TokenStream stream = ignoreCase ? new LowerCaseFilter(Version.LUCENE_43, tokenizer) : tokenizer;
      return new TokenStreamComponents(tokenizer, stream);
    }
  };

  try {
    String formatClass = format;
    if (format == null || format.equals("solr")) {
      formatClass = SolrSynonymParser.class.getName();
    } else if (format.equals("wordnet")) {
      formatClass = WordnetSynonymParser.class.getName();
    }
    // TODO: expose dedup as a parameter?
    map = loadSynonyms(loader, formatClass, true, analyzer);
  } catch (ParseException e) {
    throw new IOException("Error parsing synonyms file:", e);
  }
}
public void testMissingPayload() throws Exception {
  Directory dir = newDirectory();

  // MockAnalyzer minus maybePayload, else it sometimes stuffs in an 8-byte payload!
  Analyzer a = new Analyzer() {
    @Override
    public TokenStreamComponents createComponents(String fieldName, Reader reader) {
      MockTokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, true, 100);
      tokenizer.setEnableChecks(true);
      MockTokenFilter filt = new MockTokenFilter(tokenizer, MockTokenFilter.EMPTY_STOPSET);
      return new TokenStreamComponents(tokenizer, filt);
    }
  };

  IndexWriterConfig iwc = newIndexWriterConfig(a);
  iwc.setCodec(TestUtil.alwaysPostingsFormat(new IDVersionPostingsFormat()));
  RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
  Document doc = new Document();
  doc.add(newTextField("id", "id", Field.Store.NO));
  try {
    w.addDocument(doc);
    w.commit();
    fail("didn't hit expected exception");
  } catch (IllegalArgumentException iae) {
    // expected
  }

  w.close();
  dir.close();
}
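// Hedged sketch (an assumption, not the test's API): IDVersionPostingsFormat
// rejects the document above because the ID token carries no payload. One way to
// supply the required version payload is a single-token stream that sets
// PayloadAttribute explicitly.
//
//   import org.apache.lucene.analysis.TokenStream;
//   import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
//   import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
//   import org.apache.lucene.util.BytesRef;
final class SingleIDTokenStream extends TokenStream {
  private final CharTermAttribute term = addAttribute(CharTermAttribute.class);
  private final PayloadAttribute payload = addAttribute(PayloadAttribute.class);
  private final String id;
  private final BytesRef version;
  private boolean done;

  SingleIDTokenStream(String id, BytesRef version) {
    this.id = id;
    this.version = version;
  }

  @Override
  public boolean incrementToken() {
    if (done) return false;
    done = true;
    clearAttributes();
    term.append(id);
    payload.setPayload(version); // the payload the test above deliberately omits
    return true;
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    done = false;
  }
}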
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer tokenizer = new ReversePathHierarchyTokenizer(newAttributeFactory(), reader, DEFAULT_DELIMITER, DEFAULT_DELIMITER, DEFAULT_SKIP);
      return new TokenStreamComponents(tokenizer, tokenizer);
    }
  };
  // TODO: properly support positionLengthAttribute
  checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER, 20, false, false);
}
/** blast some random large strings through the analyzer */
public void testRandomHugeStrings() throws Exception {
  Random random = random();
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer tokenizer = new ReversePathHierarchyTokenizer(newAttributeFactory(), reader, DEFAULT_DELIMITER, DEFAULT_DELIMITER, DEFAULT_SKIP);
      return new TokenStreamComponents(tokenizer, tokenizer);
    }
  };
  // TODO: properly support positionLengthAttribute
  checkRandomData(random, a, 100*RANDOM_MULTIPLIER, 1027, false, false);
}
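// Hedged sketch (not from the source): per the ReversePathHierarchyTokenizer
// Javadoc, with the default delimiter and skip=0 a path is emitted from the full
// string down to its last component; in the harness style used above:
assertAnalyzesTo(a, "/a/b/c",
    new String[] { "/a/b/c", "a/b/c", "b/c", "c" });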
public void testEmptyTerm() throws IOException {
  for (final String lang : SNOWBALL_LANGS) {
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new KeywordTokenizer(reader);
        return new TokenStreamComponents(tokenizer, new SnowballFilter(tokenizer, lang));
      }
    };
    checkOneTerm(a, "", "");
  }
}
public void checkRandomStrings(final String snowballLanguage) throws IOException {
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer t = new MockTokenizer(reader);
      return new TokenStreamComponents(t, new SnowballFilter(t, snowballLanguage));
    }
  };
  checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
}
@Override
protected TokenStreamComponents createComponents(String fieldName) {
  Tokenizer tokenizer = new WhitespaceTokenizer();
  // strip punctuation framing each token, keeping only the middle group
  TokenStream result = new PatternReplaceFilter(tokenizer,
      Pattern.compile("^([\\.!\\?,:;\"'\\(\\)]*)(.*?)([\\.!\\?,:;\"'\\(\\)]*)$"),
      "$2", true);
  // drop the apostrophe in possessives: "'s" -> "s"
  result = new PatternReplaceFilter(result, Pattern.compile("'s"), "s", true);
  return new TokenStreamComponents(tokenizer, result);
}
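// Hedged sketch (not from the source): the intended effect of the two
// PatternReplaceFilter stages above, in the assertAnalyzesTo style used elsewhere
// in this section; 'a' is assumed to be an Analyzer wrapping createComponents().
assertAnalyzesTo(a, "(hello!) John's",
    new String[] { "hello", "Johns" });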
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer tokenizer = new ReversePathHierarchyTokenizer(reader);
      return new TokenStreamComponents(tokenizer, tokenizer);
    }
  };
  checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
}
/** blast some random large strings through the analyzer */
public void testRandomHugeStrings() throws Exception {
  Random random = random();
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer tokenizer = new ReversePathHierarchyTokenizer(reader);
      return new TokenStreamComponents(tokenizer, tokenizer);
    }
  };
  checkRandomData(random, a, 100*RANDOM_MULTIPLIER, 1027);
}
public void testEmptyTerm() throws IOException {
  for (final String lang : SNOWBALL_LANGS) {
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new KeywordTokenizer(reader);
        return new TokenStreamComponents(tokenizer, new SnowballFilter(tokenizer, lang));
      }
    };
    checkOneTermReuse(a, "", "");
  }
}
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
  Tokenizer source = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
  TokenStream result = new LowerCaseFilter(TEST_VERSION_CURRENT, source);
  return new TokenStreamComponents(source, new PortugueseLightStemFilter(result));
}
public void testKeyword() throws IOException {
  final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("quilométricas"), false);
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
      TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
      return new TokenStreamComponents(source, new PortugueseLightStemFilter(sink));
    }
  };
  checkOneTerm(a, "quilométricas", "quilométricas");
}
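// Hedged sketch (not from the source): the same input without the
// KeywordMarkerFilter, to confirm the stemmer does alter unprotected terms.
// The stem is printed rather than asserted, since the exact output belongs to
// the Portuguese light stemmer.
//
//   import java.io.StringReader;
//   import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
Analyzer plain = new Analyzer() {
  @Override
  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
    return new TokenStreamComponents(source, new PortugueseLightStemFilter(source));
  }
};
try (TokenStream ts = plain.tokenStream("field", new StringReader("quilométricas"))) {
  CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
  ts.reset();
  while (ts.incrementToken()) {
    System.out.println(term); // a stemmed form, shorter than the input
  }
  ts.end();
}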
public void testEmptyTerm() throws IOException {
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer tokenizer = new KeywordTokenizer(reader);
      return new TokenStreamComponents(tokenizer, new PortugueseLightStemFilter(tokenizer));
    }
  };
  checkOneTermReuse(a, "", "");
}
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
  Tokenizer source = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
  TokenStream result = new LowerCaseFilter(TEST_VERSION_CURRENT, source);
  return new TokenStreamComponents(source, new PortugueseMinimalStemFilter(result));
}
public void testKeyword() throws IOException {
  final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("quilométricas"), false);
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
      TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
      return new TokenStreamComponents(source, new PortugueseMinimalStemFilter(sink));
    }
  };
  checkOneTerm(a, "quilométricas", "quilométricas");
}
public void testEmptyTerm() throws IOException {
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer tokenizer = new KeywordTokenizer(reader);
      return new TokenStreamComponents(tokenizer, new PortugueseMinimalStemFilter(tokenizer));
    }
  };
  checkOneTermReuse(a, "", "");
}
@Override
public void setUp() throws Exception {
  super.setUp();
  // Create the index in-memory
  indexApi = new IndexApi(new RAMDirectory(), true);
  indexApi.registerTokenizer(IFields.PYTHON, CodeAnalyzer.createPythonStreamComponents());
  TokenStreamComponents stringOrComment = CodeAnalyzer.createStringsOrCommentsStreamComponents();
  indexApi.registerTokenizer(IFields.STRING, stringOrComment);
  indexApi.registerTokenizer(IFields.COMMENT, stringOrComment);
}
public void testKeyword() throws IOException {
  final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("quilométricas"), false);
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
      TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
      return new TokenStreamComponents(source, new PortugueseLightStemFilter(sink));
    }
  };
  checkOneTerm(a, "quilométricas", "quilométricas");
}