public void testSimple() throws IOException { Analyzer analyzer = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName) { Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false); return new TokenStreamComponents(t, new UniqueTokenFilter(t)); } }; TokenStream test = analyzer.tokenStream("test", "this test with test"); test.reset(); CharTermAttribute termAttribute = test.addAttribute(CharTermAttribute.class); assertThat(test.incrementToken(), equalTo(true)); assertThat(termAttribute.toString(), equalTo("this")); assertThat(test.incrementToken(), equalTo(true)); assertThat(termAttribute.toString(), equalTo("test")); assertThat(test.incrementToken(), equalTo(true)); assertThat(termAttribute.toString(), equalTo("with")); assertThat(test.incrementToken(), equalTo(false)); }
public void testBackwardsCompatibilityEdgeNgramTokenFilter() throws Exception { int iters = scaledRandomIntBetween(20, 100); for (int i = 0; i < iters; i++) { final Index index = new Index("test", "_na_"); final String name = "ngr"; Version v = randomVersion(random()); Builder builder = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3); boolean reverse = random().nextBoolean(); if (reverse) { builder.put("side", "back"); } Settings settings = builder.build(); Settings indexSettings = newAnalysisSettingsBuilder().put(IndexMetaData.SETTING_VERSION_CREATED, v.id).build(); Tokenizer tokenizer = new MockTokenizer(); tokenizer.setReader(new StringReader("foo bar")); TokenStream edgeNGramTokenFilter = new EdgeNGramTokenFilterFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings).create(tokenizer); if (reverse) { assertThat(edgeNGramTokenFilter, instanceOf(ReverseStringFilter.class)); } else { assertThat(edgeNGramTokenFilter, instanceOf(EdgeNGramTokenFilter.class)); } } }
public void testCountPositions() throws IOException {
  // We're looking to make sure that we:
  Token t1 = new Token();      // Don't count tokens without an increment
  t1.setPositionIncrement(0);
  Token t2 = new Token();
  t2.setPositionIncrement(1);  // Count normal tokens with one increment
  Token t3 = new Token();
  t3.setPositionIncrement(2);  // Count funny tokens with more than one increment
  int finalTokenIncrement = 4; // Count the final token increment on the rare token streams that have them
  Token[] tokens = new Token[] {t1, t2, t3};
  Collections.shuffle(Arrays.asList(tokens), random());
  final TokenStream tokenStream = new CannedTokenStream(finalTokenIncrement, 0, tokens);
  // TODO: we have no CannedAnalyzer?
  Analyzer analyzer = new Analyzer() {
    @Override
    public TokenStreamComponents createComponents(String fieldName) {
      return new TokenStreamComponents(new MockTokenizer(), tokenStream);
    }
  };
  assertThat(TokenCountFieldMapper.countPositions(analyzer, "", ""), equalTo(7));
}
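// Note on the expected value above: countPositions(...) sums the position increments of the
// three (shuffled) tokens plus the trailing final-token increment, i.e. 0 + 1 + 2 + 4 = 7.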
public void testRecursion3() throws Exception { b = new SynonymMap.Builder(true); final boolean keepOrig = true; add("zoo zoo", "zoo", keepOrig); final SynonymMap map = b.build(); Analyzer a = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true)); } }; assertAnalyzesTo(a, "zoo zoo $ zoo", new String[] { "zoo", "zoo", "zoo", "$", "zoo" }, new int[] { 1, 0, 1, 1, 1 }); }
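// The synonym tests call an add(...) helper that is not shown here. A minimal sketch of what
// such a helper could look like is given below; it assumes a shared SynonymMap.Builder field
// named "b" (as used in the tests) and that multi-word rules are joined with the SynonymMap
// word separator. The actual helper in the test class may differ.
private void add(String input, String output, boolean keepOrig) {
  // SynonymMap uses '\u0000' as the separator between the words of a multi-word synonym
  b.add(new CharsRef(input.replaceAll(" +", "\u0000")),
        new CharsRef(output.replaceAll(" +", "\u0000")),
        keepOrig);
}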
/** * Test that invalid arguments result in exception */ public void testInvalidArguments() throws Exception { for (final String arg : new String[]{"minWordLength", "maxTokenLength", "maxWordCount"}) { try { Reader reader = new StringReader("foo foobar super-duper-trooper"); TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); tokenFilterFactory("Capitalization", "keep", "and the it BIG", "onlyFirstWord", "false", arg, "-3", "okPrefix", "McK", "forceFirstLetter", "true").create(stream); fail(); } catch (IllegalArgumentException expected) { assertTrue(expected.getMessage().contains(arg + " must be greater than or equal to zero") || expected.getMessage().contains(arg + " must be greater than zero")); } } }
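// Several tests in this file obtain factories through a tokenFilterFactory(name, key, value, ...)
// helper inherited from the factory test base class. A rough, illustrative sketch of such a helper
// is shown below; it assumes SPI lookup by factory name plus key/value argument pairs. The real
// helper (in BaseTokenStreamFactoryTestCase) also handles version and resource-loading details.
protected TokenFilterFactory tokenFilterFactory(String name, String... keysAndValues) throws Exception {
  Map<String, String> args = new HashMap<>();
  for (int i = 0; i < keysAndValues.length; i += 2) {
    args.put(keysAndValues[i], keysAndValues[i + 1]);
  }
  args.put("luceneMatchVersion", Version.LATEST.toString());
  TokenFilterFactory factory = TokenFilterFactory.forName(name, args);
  if (factory instanceof ResourceLoaderAware) {
    // lets factories such as ElisionFilterFactory load resources like frenchArticles.txt
    ((ResourceLoaderAware) factory).inform(new ClasspathResourceLoader(getClass()));
  }
  return factory;
}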
public void testRandomString() throws Exception { Analyzer a = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); return new TokenStreamComponents(tokenizer, new PatternCaptureGroupTokenFilter(tokenizer, false, Pattern.compile("((..)(..))"))); } }; checkRandomData(random(), a, 1000 * RANDOM_MULTIPLIER); }
/**
 * Initializes the tests by adding 4 identical documents to the index.
 */
@Override
public void setUp() throws Exception {
  super.setUp();
  // create test index
  mDirectory = newDirectory();
  final RandomIndexWriter writer = new RandomIndexWriter(random(), mDirectory,
      newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET))
          .setMergePolicy(newLogMergePolicy())
          .setSimilarity(new DefaultSimilarity()));
  addDocument(writer, "1", "I think it should work.");
  addDocument(writer, "2", "I think it should work.");
  addDocument(writer, "3", "I think it should work.");
  addDocument(writer, "4", "I think it should work.");
  reader = writer.getReader();
  writer.close();
  searcher = newSearcher(reader);
  searcher.setSimilarity(new DefaultSimilarity());
}
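// setUp() relies on an addDocument(writer, id, text) helper that is not shown. A minimal sketch
// is given below; the field names "id" and "text" are assumptions for illustration and may not
// match the ones used elsewhere in the test class.
private void addDocument(RandomIndexWriter writer, String id, String text) throws IOException {
  Document document = new Document();
  document.add(newStringField("id", id, Field.Store.YES));
  document.add(newTextField("text", text, Field.Store.YES));
  writer.addDocument(document);
}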
public void testHyphenationCompoundWordsDELongestMatch() throws Exception {
  CharArraySet dict = makeDictionary("basketball", "basket", "ball", "kurv");

  InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm());
  HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(is);

  // the word basket will not be added due to the longest match option
  HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
      new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false),
      hyphenator, dict,
      CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
      CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, 40, true);
  assertTokenStreamContents(tf,
      new String[] { "basketballkurv", "basketball", "ball", "kurv" },
      new int[] { 1, 0, 0, 0 });
}
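// The compound-word tests call a makeDictionary(...) helper. A plausible sketch, assuming it
// simply wraps the given words in a case-insensitive CharArraySet:
private static CharArraySet makeDictionary(String... dictionary) {
  return new CharArraySet(Arrays.asList(dictionary), true);
}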
public void test() throws IOException {
  PrefixAwareTokenFilter ts;

  ts = new PrefixAwareTokenFilter(
      new SingleTokenTokenStream(createToken("a", 0, 1)),
      new SingleTokenTokenStream(createToken("b", 0, 1)));
  assertTokenStreamContents(ts,
      new String[] { "a", "b" },
      new int[] { 0, 1 },
      new int[] { 1, 2 });

  // prefix and suffix using 2x prefix
  ts = new PrefixAwareTokenFilter(new SingleTokenTokenStream(createToken("^", 0, 0)),
      new MockTokenizer(new StringReader("hello world"), MockTokenizer.WHITESPACE, false));
  ts = new PrefixAwareTokenFilter(ts, new SingleTokenTokenStream(createToken("$", 0, 0)));
  assertTokenStreamContents(ts,
      new String[] { "^", "hello", "world", "$" },
      new int[] { 0, 0, 6, 11 },
      new int[] { 0, 5, 11, 11 });
}
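// test() uses a createToken(term, start, end) helper. A minimal sketch, assuming it just builds
// a Token with the given term text and offsets:
private static Token createToken(String term, int start, int end) {
  return new Token(term, start, end);
}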
private float checkPhraseQuery(Document doc, PhraseQuery query, int slop, int expectedNumResults) throws Exception {
  query.setSlop(slop);

  Directory ramDir = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), ramDir,
      new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false));
  writer.addDocument(doc);

  IndexReader reader = writer.getReader();

  IndexSearcher searcher = newSearcher(reader);
  MaxFreqCollector c = new MaxFreqCollector();
  searcher.search(query, c);
  assertEquals("slop: "+slop+" query: "+query+" doc: "+doc+" Wrong number of hits", expectedNumResults, c.totalHits);

  //QueryUtils.check(query,searcher);
  writer.close();
  reader.close();
  ramDir.close();

  // returns the max Scorer.freq() found, because even though norms are omitted, many index stats are different
  // with these different tokens/distributions/lengths.. otherwise this test is very fragile.
  return c.max;
}
public void testTokenEndingWithWordComponentOfMinimumLength() throws Exception { CharArraySet dict = makeDictionary("ab", "cd", "ef"); Tokenizer tokenizer = new MockTokenizer(new StringReader("abcdef"), MockTokenizer.WHITESPACE, false); DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter( tokenizer, dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false); assertTokenStreamContents(tf, new String[] { "abcdef", "ab", "cd", "ef" }, new int[] { 0, 0, 0, 0}, new int[] { 6, 6, 6, 6}, new int[] { 1, 0, 0, 0} ); }
@BeforeClass
public static void beforeClass() throws Exception {
  String[] data = new String[] {
      "A 1 2 3 4 5 6",
      "Z 4 5 6",
      null,
      "B 2 4 5 6",
      "Y 3 5 6",
      null,
      "C 3 6",
      "X 4 5 6"
  };

  small = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), small,
      newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false))
          .setMergePolicy(newLogMergePolicy()));

  FieldType customType = new FieldType(TextField.TYPE_STORED);
  customType.setTokenized(false);

  for (int i = 0; i < data.length; i++) {
    Document doc = new Document();
    doc.add(newField("id", String.valueOf(i), customType)); // Field.Keyword("id",String.valueOf(i)));
    doc.add(newField("all", "all", customType)); // Field.Keyword("all","all"));
    if (null != data[i]) {
      doc.add(newTextField("data", data[i], Field.Store.YES)); // Field.Text("data",data[i]));
    }
    writer.addDocument(doc);
  }

  reader = writer.getReader();
  writer.close();
}
private Map<String,Float> getOriginalValues() throws IOException { Map<String,Float> originalValues = new HashMap<>(); MoreLikeThis mlt = new MoreLikeThis(reader); mlt.setAnalyzer(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false)); mlt.setMinDocFreq(1); mlt.setMinTermFreq(1); mlt.setMinWordLen(1); mlt.setFieldNames(new String[] {"text"}); mlt.setBoost(true); BooleanQuery query = (BooleanQuery) mlt.like("text", new StringReader( "lucene release")); List<BooleanClause> clauses = query.clauses(); for (BooleanClause clause : clauses) { TermQuery tq = (TermQuery) clause.getQuery(); originalValues.put(tq.getTerm().text(), tq.getBoost()); } return originalValues; }
public void testMultiValues() throws Exception { MoreLikeThis mlt = new MoreLikeThis(reader); mlt.setAnalyzer(new MockAnalyzer(random(), MockTokenizer.KEYWORD, false)); mlt.setMinDocFreq(1); mlt.setMinTermFreq(1); mlt.setMinWordLen(1); mlt.setFieldNames(new String[] {"text"}); BooleanQuery query = (BooleanQuery) mlt.like("text", new StringReader("lucene"), new StringReader("lucene release"), new StringReader("apache"), new StringReader("apache lucene")); List<BooleanClause> clauses = query.clauses(); assertEquals("Expected 2 clauses only!", 2, clauses.size()); for (BooleanClause clause : clauses) { Term term = ((TermQuery) clause.getQuery()).getTerm(); assertTrue(Arrays.asList(new Term("text", "lucene"), new Term("text", "apache")).contains(term)); } }
public void testRandom2GraphAfter() throws Exception { final int numIters = atLeast(3); Random random = random(); for (int i = 0; i < numIters; i++) { b = new SynonymMap.Builder(random.nextBoolean()); final int numEntries = atLeast(10); for (int j = 0; j < numEntries; j++) { add(randomNonEmptyString(), randomNonEmptyString(), random.nextBoolean()); } final SynonymMap map = b.build(); final boolean ignoreCase = random.nextBoolean(); final Analyzer analyzer = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true); TokenStream syns = new SynonymFilter(tokenizer, map, ignoreCase); TokenStream graph = new MockGraphTokenFilter(random(), syns); return new TokenStreamComponents(tokenizer, graph); } }; checkRandomData(random, analyzer, 100); } }
/** blast some random strings through the analyzer */ public void testRandomStrings() throws Exception { final int numIters = atLeast(10); for (int i = 0; i < numIters; i++) { SynonymMap.Builder b = new SynonymMap.Builder(random().nextBoolean()); final int numEntries = atLeast(10); for (int j = 0; j < numEntries; j++) { add(b, randomNonEmptyString(), randomNonEmptyString(), random().nextBoolean()); } final SynonymMap map = b.build(); final boolean ignoreCase = random().nextBoolean(); final Analyzer analyzer = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true); TokenStream stream = new SynonymFilter(tokenizer, map, ignoreCase); return new TokenStreamComponents(tokenizer, new RemoveDuplicatesTokenFilter(stream)); } }; checkRandomData(random(), analyzer, 200); } }
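// The two random synonym tests above also rely on a randomNonEmptyString() helper and, in the
// second test, on an add(...) overload that takes the builder explicitly. Plausible sketches are
// given below, assuming the LuceneTestCase/TestUtil utilities; the real helpers may differ.
private String randomNonEmptyString() {
  while (true) {
    // avoid empty strings and the synonym word separator
    String s = TestUtil.randomUnicodeString(random()).trim();
    if (s.length() != 0 && s.indexOf('\u0000') == -1) {
      return s;
    }
  }
}

private void add(SynonymMap.Builder b, String input, String output, boolean keepOrig) {
  b.add(new CharsRef(input.replaceAll(" +", "\u0000")),
        new CharsRef(output.replaceAll(" +", "\u0000")),
        keepOrig);
}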
public void testMultipleStopWordsEnd() throws Exception { CharArraySet stopWords = StopFilter.makeStopSet("to", "the", "a"); TokenStream stream = new MockTokenizer(new StringReader("go to a the")); TokenStream filter = new SuggestStopFilter(stream, stopWords); assertTokenStreamContents(filter, new String[] { "go", "the"}, new int[] {0, 8}, new int[] {2, 11}, null, new int[] {1, 3}, null, 11, new boolean[] {false, true}, true); }
public void testFuzzySlopeExtendability() throws ParseException { QueryParser qp = new QueryParser("a", new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false)) { @Override Query handleBareFuzzy(String qfield, Token fuzzySlop, String termImage) throws ParseException { if(fuzzySlop.image.endsWith("€")) { float fms = fuzzyMinSim; try { fms = Float.valueOf(fuzzySlop.image.substring(1, fuzzySlop.image.length()-1)).floatValue(); } catch (Exception ignored) { } float value = Float.parseFloat(termImage); return getRangeQuery(qfield, Float.toString(value-fms/2.f), Float.toString(value+fms/2.f), true, true); } return super.handleBareFuzzy(qfield, fuzzySlop, termImage); } }; assertEquals(qp.parse("a:[11.95 TO 12.95]"), qp.parse("12.45~1€")); }
public void testBogusField() throws Exception { DirectSpellChecker spellChecker = new DirectSpellChecker(); Directory dir = newDirectory(); RandomIndexWriter writer = new RandomIndexWriter(random(), dir, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true)); for (int i = 0; i < 20; i++) { Document doc = new Document(); doc.add(newTextField("numbers", English.intToEnglish(i), Field.Store.NO)); writer.addDocument(doc); } IndexReader ir = writer.getReader(); SuggestWord[] similar = spellChecker.suggestSimilar(new Term( "bogusFieldBogusField", "fvie"), 2, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX); assertEquals(0, similar.length); ir.close(); writer.close(); dir.close(); }
public void testInvalidOffset() throws Exception { Analyzer analyzer = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); TokenFilter filters = new ASCIIFoldingFilter(tokenizer); filters = new WordTokenFilter(filters); return new TokenStreamComponents(tokenizer, filters); } }; assertAnalyzesTo(analyzer, "mosfellsbær", new String[] { "mosfellsbaer" }, new int[] { 0 }, new int[] { 11 }); }
public void testSimple() throws IOException { Analyzer analyzer = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName) { Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false); return new TokenStreamComponents(t, new TruncateTokenFilter(t, 3)); } }; TokenStream test = analyzer.tokenStream("test", "a bb ccc dddd eeeee"); test.reset(); CharTermAttribute termAttribute = test.addAttribute(CharTermAttribute.class); assertThat(test.incrementToken(), equalTo(true)); assertThat(termAttribute.toString(), equalTo("a")); assertThat(test.incrementToken(), equalTo(true)); assertThat(termAttribute.toString(), equalTo("bb")); assertThat(test.incrementToken(), equalTo(true)); assertThat(termAttribute.toString(), equalTo("ccc")); assertThat(test.incrementToken(), equalTo(true)); assertThat(termAttribute.toString(), equalTo("ddd")); assertThat(test.incrementToken(), equalTo(true)); assertThat(termAttribute.toString(), equalTo("eee")); assertThat(test.incrementToken(), equalTo(false)); }
private void testThreadSafety(TokenFilterFactory factory) throws IOException { final Analyzer analyzer = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName) { Tokenizer tokenizer = new MockTokenizer(); return new TokenStreamComponents(tokenizer, factory.create(tokenizer)); } }; BaseTokenStreamTestCase.checkRandomData(random(), analyzer, 100); }
@Test public void testUsingPackagedWordNetReader() throws IOException { Map<String, String> args = new HashMap<>(); LemmatizerFilterFactory factory = new LemmatizerFilterFactory(args); StringReader reader = new StringReader("it better works"); final MockTokenizer in = new MockTokenizer(MockTokenizer.WHITESPACE, false); in.setReader(reader); TokenStream stream = factory.create(in); assertTokenStreamContents(stream, new String[] { "it", "good", "work" }); }
@Test public void testWithSamplePhrase() throws IOException { StringReader reader = new StringReader("it better works"); final MockTokenizer in = new MockTokenizer(MockTokenizer.WHITESPACE, false); in.setReader(reader); TokenStream stream = new LemmatizerFilter(in, new WordNetLemmatizer(new PackagedWordNetReader("wordnet.zip"), new RTrie())); assertTokenStreamContents(stream, new String[] { "it", "good", "work" }); }
@Test public void testUsingPackagedWordNetReaderFromFilterFactory() throws IOException { Map<String, String> args = new HashMap<>(); LemmatizerFilterFactory factory = new LemmatizerFilterFactory(args); StringReader reader = new StringReader("it better works"); final MockTokenizer in = new MockTokenizer(MockTokenizer.WHITESPACE, false); in.setReader(reader); TokenStream stream = factory.create(in); assertTokenStreamContents(stream, new String[] { "it", "good", "work" }); }
@Test public void testUsingDirectoryWordNetReaderWithDummyPathShouldFailSilently() throws IOException { Map<String, String> args = new HashMap<>(); args.put("dictPath", "/tmp"); LemmatizerFilterFactory factory = new LemmatizerFilterFactory(args); StringReader reader = new StringReader("it better works"); final MockTokenizer in = new MockTokenizer(MockTokenizer.WHITESPACE, false); in.setReader(reader); TokenStream stream = factory.create(in); assertTokenStreamContents(stream, new String[] { "it", "better", "works" }); }
/** parse a syn file with some escaped syntax chars */ public void testEscapedStuff() throws Exception { String testFile = "a\\=>a => b\\=>b\n" + "a\\,a => b\\,b"; SolrSynonymParser parser = new SolrSynonymParser(true, true, new MockAnalyzer(random(), MockTokenizer.KEYWORD, false)); parser.parse(new StringReader(testFile)); final SynonymMap map = parser.build(); Analyzer analyzer = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.KEYWORD, false); return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, false)); } }; assertAnalyzesTo(analyzer, "ball", new String[] { "ball" }, new int[] { 1 }); assertAnalyzesTo(analyzer, "a=>a", new String[] { "b=>b" }, new int[] { 1 }); assertAnalyzesTo(analyzer, "a,a", new String[] { "b,b" }, new int[] { 1 }); }
/** * Test EdgeNGramFilterFactory with min and max gram size */ public void testEdgeNGramFilter2() throws Exception { Reader reader = new StringReader("test"); TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); stream = tokenFilterFactory("EdgeNGram", "minGramSize", "1", "maxGramSize", "2").create(stream); assertTokenStreamContents(stream, new String[] { "t", "te" }); }
/** * Test EdgeNGramFilterFactory */ public void testEdgeNGramFilter() throws Exception { Reader reader = new StringReader("test"); TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); stream = tokenFilterFactory("EdgeNGram").create(stream); assertTokenStreamContents(stream, new String[] { "t" }); }
public void testStripAll() throws Exception { String input = "aabfooaabfooabfoob ab caaaaaaaaab"; TokenStream ts = new PatternReplaceFilter (new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false), Pattern.compile("a*b"), null, true); assertTokenStreamContents(ts, new String[] { "foofoofoo", "", "c" }); }
public void testKeyword() throws IOException { final CharArraySet exclusionSet = new CharArraySet( asSet("sängerinnen"), false); Analyzer a = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet); return new TokenStreamComponents(source, new GermanStemFilter(sink)); } }; checkOneTerm(a, "sängerinnen", "sängerinnen"); }
/** * Test setting ignoreCase=true */ public void testCaseInsensitive() throws Exception { Reader reader = new StringReader("L'avion"); TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); stream = tokenFilterFactory("Elision", "articles", "frenchArticles.txt", "ignoreCase", "true").create(stream); assertTokenStreamContents(stream, new String[] { "avion" }); }
@Override public void testEscapedVsQuestionMarkAsWildcard() throws Exception { Analyzer a = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false); assertQueryEquals("a:b\\-?c", a, "a:b-?c"); assertQueryEquals("a:b\\+?c", a, "a:b+?c"); assertQueryEquals("a:b\\:?c", a, "a:b:?c"); assertQueryEquals("a:b\\\\?c", a, "a:b\\?c"); }
public void testWildcardInConstantScore() throws Exception {
  Directory dir = newDirectory();
  // use simpleanalyzer for more natural tokenization (else "test." is a token)
  final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
  IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
  iwc.setMergePolicy(newLogMergePolicy());
  RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);

  FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
  offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
  Field body = new Field("body", "", offsetsType);
  Document doc = new Document();
  doc.add(body);

  body.setStringValue("This is a test.");
  iw.addDocument(doc);
  body.setStringValue("Test a one sentence document.");
  iw.addDocument(doc);

  IndexReader ir = iw.getReader();
  iw.close();

  IndexSearcher searcher = newSearcher(ir);
  PostingsHighlighter highlighter = new PostingsHighlighter() {
    @Override
    protected Analyzer getIndexAnalyzer(String field) {
      return analyzer;
    }
  };
  ConstantScoreQuery query = new ConstantScoreQuery(new WildcardQuery(new Term("body", "te*")));
  TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
  assertEquals(2, topDocs.totalHits);
  String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
  assertEquals(2, snippets.length);
  assertEquals("This is a <b>test</b>.", snippets[0]);
  assertEquals("<b>Test</b> a one sentence document.", snippets[1]);

  ir.close();
  dir.close();
}
public void testKeyword() throws IOException { final CharArraySet exclusionSet = new CharArraySet( asSet("энергии"), false); Analyzer a = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet); return new TokenStreamComponents(source, new RussianLightStemFilter(sink)); } }; checkOneTerm(a, "энергии", "энергии"); }
public void testSpanWildcard() throws Exception {
  Directory dir = newDirectory();
  // use simpleanalyzer for more natural tokenization (else "test." is a token)
  final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
  IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
  iwc.setMergePolicy(newLogMergePolicy());
  RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);

  FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
  offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
  Field body = new Field("body", "", offsetsType);
  Document doc = new Document();
  doc.add(body);

  body.setStringValue("This is a test.");
  iw.addDocument(doc);
  body.setStringValue("Test a one sentence document.");
  iw.addDocument(doc);

  IndexReader ir = iw.getReader();
  iw.close();

  IndexSearcher searcher = newSearcher(ir);
  PostingsHighlighter highlighter = new PostingsHighlighter() {
    @Override
    protected Analyzer getIndexAnalyzer(String field) {
      return analyzer;
    }
  };
  Query query = new SpanMultiTermQueryWrapper<>(new WildcardQuery(new Term("body", "te*")));
  TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
  assertEquals(2, topDocs.totalHits);
  String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
  assertEquals(2, snippets.length);
  assertEquals("This is a <b>test</b>.", snippets[0]);
  assertEquals("<b>Test</b> a one sentence document.", snippets[1]);

  ir.close();
  dir.close();
}
public void testSynonyms() throws Exception { WordnetSynonymParser parser = new WordnetSynonymParser(true, true, new MockAnalyzer(random())); parser.parse(new StringReader(synonymsFile)); final SynonymMap map = parser.build(); Analyzer analyzer = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, false)); } }; /* all expansions */ assertAnalyzesTo(analyzer, "Lost in the woods", new String[] { "Lost", "in", "the", "woods", "wood", "forest" }, new int[] { 0, 5, 8, 12, 12, 12 }, new int[] { 4, 7, 11, 17, 17, 17 }, new int[] { 1, 1, 1, 1, 0, 0 }); /* single quote */ assertAnalyzesTo(analyzer, "king", new String[] { "king", "baron" }); /* multi words */ assertAnalyzesTo(analyzer, "king's evil", new String[] { "king's", "king's", "evil", "meany" }); }
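// testSynonyms() parses a synonymsFile field defined elsewhere in the test class. For reference,
// WordnetSynonymParser expects the WordNet prolog format (wn_s.pl). An illustrative fragment that
// would be consistent with the assertions above might look like the following; the exact synset
// ids and entries are assumptions, not the actual test fixture.
String synonymsFile =
    "s(100000001,1,'woods',n,1,0).\n" +
    "s(100000001,2,'wood',n,1,0).\n" +
    "s(100000001,3,'forest',n,1,0).\n" +
    "s(100000002,1,'king',n,1,1).\n" +
    "s(100000002,2,'baron',n,1,1).\n" +
    "s(100000003,1,'king''s evil',n,1,1).\n" +
    "s(100000003,2,'king''s meany',n,1,1).\n";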
public void testPositionIncrements() throws Exception { Reader reader = new StringReader("foo foobar super-duper-trooper"); TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); stream = tokenFilterFactory("CodepointCount", "min", "4", "max", "10").create(stream); assertTokenStreamContents(stream, new String[] { "foobar" }, new int[] { 2 }); }
/** Test that invalid arguments result in exception */ public void testInvalidArguments() throws Exception { try { Reader reader = new StringReader("foo foobar super-duper-trooper"); TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); tokenFilterFactory("Length", LengthFilterFactory.MIN_KEY, "5", LengthFilterFactory.MAX_KEY, "4").create(stream); fail(); } catch (IllegalArgumentException expected) { assertTrue(expected.getMessage().contains("maximum length must not be greater than minimum length")); } }
public void testKeyword() throws IOException { final CharArraySet exclusionSet = new CharArraySet( asSet("quilométricas"), false); Analyzer a = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet); return new TokenStreamComponents(source, new PortugueseStemFilter(sink)); } }; checkOneTerm(a, "quilométricas", "quilométricas"); }