/**
 * Initializes the tests by adding documents to the index.
 */
@Override
public void setUp() throws Exception {
  super.setUp();

  // create test index
  final RandomIndexWriter writer = new RandomIndexWriter(random(), mDirectory,
      newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true,
          MockTokenFilter.ENGLISH_STOPSET))
          .setOpenMode(OpenMode.APPEND)
          .setMergePolicy(newLogMergePolicy())
          .setSimilarity(new DefaultSimilarity()));
  addDocument(writer, "A", "Should we, could we, would we?");
  addDocument(writer, "B", "It should. Should it?");
  addDocument(writer, "C", "It shouldn't.");
  addDocument(writer, "D", "Should we, should we, should we.");
  reader2 = writer.getReader();
  writer.close();

  // re-open the searcher since we added more docs
  searcher2 = newSearcher(reader2);
  searcher2.setSimilarity(new DefaultSimilarity());
}
/**
 * Initializes the tests by adding 4 identical documents to the index.
 */
@Override
public void setUp() throws Exception {
  super.setUp();

  // create test index
  mDirectory = newDirectory();
  final RandomIndexWriter writer = new RandomIndexWriter(random(), mDirectory,
      newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true,
          MockTokenFilter.ENGLISH_STOPSET))
          .setMergePolicy(newLogMergePolicy())
          .setSimilarity(new DefaultSimilarity()));
  addDocument(writer, "1", "I think it should work.");
  addDocument(writer, "2", "I think it should work.");
  addDocument(writer, "3", "I think it should work.");
  addDocument(writer, "4", "I think it should work.");
  reader = writer.getReader();
  writer.close();
  searcher = newSearcher(reader);
  searcher.setSimilarity(new DefaultSimilarity());
}
public void testStopwordsPosIncHole() throws Exception {
  Directory dir = newDirectory();
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer tokenizer = new MockTokenizer(reader);
      TokenStream stream = new MockTokenFilter(tokenizer, MockTokenFilter.ENGLISH_STOPSET);
      return new TokenStreamComponents(tokenizer, stream);
    }
  };
  RandomIndexWriter iw = new RandomIndexWriter(random(), dir, a);
  Document doc = new Document();
  doc.add(new TextField("body", "just a", Field.Store.NO));
  doc.add(new TextField("body", "test of gaps", Field.Store.NO));
  iw.addDocument(doc);
  IndexReader ir = iw.getReader();
  iw.close();
  IndexSearcher is = newSearcher(ir);
  PhraseQuery pq = new PhraseQuery();
  pq.add(new Term("body", "just"), 0);
  pq.add(new Term("body", "test"), 2);
  // body:"just ? test"
  assertEquals(1, is.search(pq, 5).totalHits);
  ir.close();
  dir.close();
}
/**
 * This method is intended for use with
 * <tt>testHighlightingWithDefaultField()</tt>.
 */
private String highlightField(Query query, String fieldName, String text)
    throws IOException, InvalidTokenOffsetsException {
  TokenStream tokenStream = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true,
      MockTokenFilter.ENGLISH_STOPSET, true)
      .tokenStream(fieldName, new StringReader(text));
  // Assuming "<B>", "</B>" used to highlight
  SimpleHTMLFormatter formatter = new SimpleHTMLFormatter();
  MyQueryScorer scorer = new MyQueryScorer(query, fieldName, FIELD_NAME);
  Highlighter highlighter = new Highlighter(formatter, scorer);
  highlighter.setTextFragmenter(new SimpleFragmenter(Integer.MAX_VALUE));
  String rv = highlighter.getBestFragments(tokenStream, text, 1, "(FIELD TEXT TRUNCATED)");
  return rv.length() == 0 ? text : rv;
}
/**
 * Initializes the tests by adding documents to the index.
 */
@Override
public void setUp() throws Exception {
  super.setUp();

  // create test index
  final RandomIndexWriter writer = new RandomIndexWriter(random(), mDirectory,
      newIndexWriterConfig(TEST_VERSION_CURRENT,
          new MockAnalyzer(random(), MockTokenizer.SIMPLE, true,
              MockTokenFilter.ENGLISH_STOPSET, true))
          .setOpenMode(OpenMode.APPEND)
          .setMergePolicy(newLogMergePolicy())
          .setSimilarity(new DefaultSimilarity()));
  addDocument(writer, "A", "Should we, could we, would we?");
  addDocument(writer, "B", "It should. Should it?");
  addDocument(writer, "C", "It shouldn't.");
  addDocument(writer, "D", "Should we, should we, should we.");
  reader2 = writer.getReader();
  writer.close();

  // re-open the searcher since we added more docs
  searcher2 = newSearcher(reader2);
  searcher2.setSimilarity(new DefaultSimilarity());
}
/**
 * Initializes the tests by adding 4 identical documents to the index.
 */
@Override
public void setUp() throws Exception {
  super.setUp();

  // create test index
  mDirectory = newDirectory();
  final RandomIndexWriter writer = new RandomIndexWriter(random(), mDirectory,
      newIndexWriterConfig(TEST_VERSION_CURRENT,
          new MockAnalyzer(random(), MockTokenizer.SIMPLE, true,
              MockTokenFilter.ENGLISH_STOPSET, true))
          .setMergePolicy(newLogMergePolicy())
          .setSimilarity(new DefaultSimilarity()));
  addDocument(writer, "1", "I think it should work.");
  addDocument(writer, "2", "I think it should work.");
  addDocument(writer, "3", "I think it should work.");
  addDocument(writer, "4", "I think it should work.");
  reader = writer.getReader();
  writer.close();
  searcher = newSearcher(reader);
  searcher.setSimilarity(new DefaultSimilarity());
}
@Override
public void testPositionIncrement() throws Exception {
  // For SQP, this only tests whether stop words have been dropped.
  // PositionIncrements are not available in SpanQueries yet.
  CommonQueryParserConfiguration qp = getParserConfig(
      new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET));
  //qp.setEnablePositionIncrements(true);
  String qtxt = "\"the words in poisitions pos02578 are stopped in this phrasequery\"";
  //               0         2                     5           7  8
  SpanNearQuery pq = (SpanNearQuery) getQuery(qtxt, qp);
  SpanQuery[] clauses = pq.getClauses();
  assertEquals(5, clauses.length);
  Set<Term> expected = new HashSet<Term>();
  expected.add(new Term("field", "words"));
  expected.add(new Term("field", "poisitions"));
  expected.add(new Term("field", "pos"));
  expected.add(new Term("field", "stopped"));
  expected.add(new Term("field", "phrasequery"));
  // every surviving clause should be one of the non-stopped terms
  for (SpanQuery clause : clauses) {
    assertTrue(expected.contains(((SpanTermQuery) clause).getTerm()));
  }
}
public void testBoost() throws Exception {
  CharacterRunAutomaton stopWords = new CharacterRunAutomaton(Automata.makeString("on"));
  Analyzer oneStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords);
  CommonQueryParserConfiguration qp = getParserConfig(oneStopAnalyzer);
  Query q = getQuery("on^1.0", qp);
  assertNotNull(q);
  q = getQuery("\"hello\"^2.0", qp);
  assertNotNull(q);
  assertEquals(getBoost(q), (float) 2.0, (float) 0.5);
  q = getQuery("hello^2.0", qp);
  assertNotNull(q);
  assertEquals(((BoostQuery) q).getBoost(), (float) 2.0, (float) 0.5);
  q = getQuery("\"on\"^1.0", qp);
  assertNotNull(q);

  Analyzer a2 = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true,
      MockTokenFilter.ENGLISH_STOPSET);
  CommonQueryParserConfiguration qp2 = getParserConfig(a2);
  q = getQuery("the^3", qp2);
  // "the" is a stop word so the result is an empty query:
  assertNotNull(q);
  assertEmpty(q);
  assertEquals(1.0f, getBoost(q), 0.01f);
}
public void testPositionIncrement() throws Exception {
  CommonQueryParserConfiguration qp = getParserConfig(
      new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET));
  qp.setEnablePositionIncrements(true);
  String qtxt = "\"the words in poisitions pos02578 are stopped in this phrasequery\"";
  //               0         2                     5           7  8
  int expectedPositions[] = {1, 3, 4, 6, 9};
  PhraseQuery pq = (PhraseQuery) getQuery(qtxt, qp);
  //System.out.println("Query text: "+qtxt);
  //System.out.println("Result: "+pq);
  Term t[] = pq.getTerms();
  int pos[] = pq.getPositions();
  for (int i = 0; i < t.length; i++) {
    //System.out.println(i+". "+t[i]+" pos: "+pos[i]);
    assertEquals("term " + i + " = " + t[i] + " has wrong term-position!",
        expectedPositions[i], pos[i]);
  }
}
public void testPositionIncrements() throws Exception {
  Directory dir = newDirectory();
  Analyzer a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true,
      MockTokenFilter.ENGLISH_STOPSET);
  IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(a));
  Document doc = new Document();
  doc.add(newTextField("field", "the wizard of ozzy", Field.Store.NO));
  w.addDocument(doc);
  IndexReader r = DirectoryReader.open(w);
  w.close();
  IndexSearcher s = newSearcher(r);
  Query q = getQuery("\"wizard of ozzy\"", a);
  assertEquals(1, s.search(q, 1).totalHits);
  r.close();
  dir.close();
}
@Test
public void testMismatchingFieldsInStandardQueryConversion() throws Exception {
  // tests what happens if a Query doesn't contain a term in the "span" field
  // in the searcher...should be no exception and zero documents returned.
  String[] docs = new String[]{"a b c a b c"};
  Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET);
  Directory directory = getDirectory(analyzer, docs);
  IndexReader reader = DirectoryReader.open(directory);
  IndexSearcher indexSearcher = new IndexSearcher(reader);
  ConcordanceSearcher searcher = new ConcordanceSearcher(
      new WindowBuilder(10, 10, analyzer.getOffsetGap(FIELD)));
  Query q = new TermQuery(new Term("_" + FIELD, "a"));
  int windowCount = -1;
  ConcordanceWindowCollector collector = new ConcordanceWindowCollector(10);
  searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);
  windowCount = collector.size();
  assertEquals(0, windowCount);
  reader.close();
  directory.close();
}
/**
 * Initializes the tests by adding documents to the index.
 */
@Override
public void setUp() throws Exception {
  super.setUp();

  // create test index
  final RandomIndexWriter writer = new RandomIndexWriter(random(), mDirectory,
      newIndexWriterConfig(TEST_VERSION_CURRENT,
          new MockAnalyzer(random(), MockTokenizer.SIMPLE, true,
              MockTokenFilter.ENGLISH_STOPSET))
          .setOpenMode(OpenMode.APPEND)
          .setMergePolicy(newLogMergePolicy())
          .setSimilarity(new DefaultSimilarity()));
  addDocument(writer, "A", "Should we, could we, would we?");
  addDocument(writer, "B", "It should. Should it?");
  addDocument(writer, "C", "It shouldn't.");
  addDocument(writer, "D", "Should we, should we, should we.");
  reader2 = writer.getReader();
  writer.close();

  // re-open the searcher since we added more docs
  searcher2 = newSearcher(reader2);
  searcher2.setSimilarity(new DefaultSimilarity());
}
/**
 * Initializes the tests by adding 4 identical documents to the index.
 */
@Override
public void setUp() throws Exception {
  super.setUp();

  // create test index
  mDirectory = newDirectory();
  final RandomIndexWriter writer = new RandomIndexWriter(random(), mDirectory,
      newIndexWriterConfig(TEST_VERSION_CURRENT,
          new MockAnalyzer(random(), MockTokenizer.SIMPLE, true,
              MockTokenFilter.ENGLISH_STOPSET))
          .setMergePolicy(newLogMergePolicy())
          .setSimilarity(new DefaultSimilarity()));
  addDocument(writer, "1", "I think it should work.");
  addDocument(writer, "2", "I think it should work.");
  addDocument(writer, "3", "I think it should work.");
  addDocument(writer, "4", "I think it should work.");
  reader = writer.getReader();
  writer.close();
  searcher = newSearcher(reader);
  searcher.setSimilarity(new DefaultSimilarity());
}
/**
 * This method is intended for use with
 * <tt>testHighlightingWithDefaultField()</tt>.
 */
private String highlightField(Query query, String fieldName, String text)
    throws IOException, InvalidTokenOffsetsException {
  TokenStream tokenStream = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true,
      MockTokenFilter.ENGLISH_STOPSET)
      .tokenStream(fieldName, text);
  // Assuming "<B>", "</B>" used to highlight
  SimpleHTMLFormatter formatter = new SimpleHTMLFormatter();
  MyQueryScorer scorer = new MyQueryScorer(query, fieldName, FIELD_NAME);
  Highlighter highlighter = new Highlighter(formatter, scorer);
  highlighter.setTextFragmenter(new SimpleFragmenter(Integer.MAX_VALUE));
  String rv = highlighter.getBestFragments(tokenStream, text, 1, "(FIELD TEXT TRUNCATED)");
  return rv.length() == 0 ? text : rv;
}
/**
 * Return a random analyzer (simple, simple with stopwords, crazy, or whitespace)
 * to analyze the terms.
 */
private Analyzer randomAnalyzer() {
  switch (random().nextInt(4)) {
    case 0:
      return new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
    case 1:
      return new MockAnalyzer(random(), MockTokenizer.SIMPLE, true,
          MockTokenFilter.ENGLISH_STOPSET);
    case 2:
      return new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
          Tokenizer tokenizer = new MockTokenizer(reader);
          return new TokenStreamComponents(tokenizer, new CrazyTokenFilter(tokenizer));
        }
      };
    default:
      return new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
  }
}
/**
 * Basic "standardanalyzer" test with stopword removal.
 */
public void testStandard() throws Exception {
  Input keys[] = new Input[] {
      new Input("the ghost of christmas past", 50),
  };

  Analyzer standard = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true,
      MockTokenFilter.ENGLISH_STOPSET);
  FuzzySuggester suggester = new FuzzySuggester(standard, standard,
      AnalyzingSuggester.EXACT_FIRST | AnalyzingSuggester.PRESERVE_SEP, 256, -1, false,
      FuzzySuggester.DEFAULT_MAX_EDITS, FuzzySuggester.DEFAULT_TRANSPOSITIONS,
      FuzzySuggester.DEFAULT_NON_FUZZY_PREFIX, FuzzySuggester.DEFAULT_MIN_FUZZY_LENGTH,
      FuzzySuggester.DEFAULT_UNICODE_AWARE);
  suggester.build(new InputArrayIterator(keys));

  List<LookupResult> results = suggester.lookup(
      TestUtil.stringToCharSequence("the ghost of chris", random()), false, 1);
  assertEquals(1, results.size());
  assertEquals("the ghost of christmas past", results.get(0).key.toString());
  assertEquals(50, results.get(0).value, 0.01F);

  // omit the 'the' since it's a stopword; it's suggested anyway
  results = suggester.lookup(TestUtil.stringToCharSequence("ghost of chris", random()),
      false, 1);
  assertEquals(1, results.size());
  assertEquals("the ghost of christmas past", results.get(0).key.toString());
  assertEquals(50, results.get(0).value, 0.01F);

  // omit the 'the' and 'of' since they are stopwords; it's suggested anyway
  results = suggester.lookup(TestUtil.stringToCharSequence("ghost chris", random()),
      false, 1);
  assertEquals(1, results.size());
  assertEquals("the ghost of christmas past", results.get(0).key.toString());
  assertEquals(50, results.get(0).value, 0.01F);
}
/**
 * Basic "standardanalyzer" test with stopword removal.
 */
public void testStandard() throws Exception {
  Input keys[] = new Input[] {
      new Input("the ghost of christmas past", 50),
  };

  Analyzer standard = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true,
      MockTokenFilter.ENGLISH_STOPSET);
  AnalyzingSuggester suggester = new AnalyzingSuggester(standard, standard,
      AnalyzingSuggester.EXACT_FIRST | AnalyzingSuggester.PRESERVE_SEP, 256, -1, false);
  suggester.build(new InputArrayIterator(keys));

  List<LookupResult> results = suggester.lookup(
      TestUtil.stringToCharSequence("the ghost of chris", random()), false, 1);
  assertEquals(1, results.size());
  assertEquals("the ghost of christmas past", results.get(0).key.toString());
  assertEquals(50, results.get(0).value, 0.01F);

  // omit the 'the' since it's a stopword; it's suggested anyway
  results = suggester.lookup(TestUtil.stringToCharSequence("ghost of chris", random()),
      false, 1);
  assertEquals(1, results.size());
  assertEquals("the ghost of christmas past", results.get(0).key.toString());
  assertEquals(50, results.get(0).value, 0.01F);

  // omit the 'the' and 'of' since they are stopwords; it's suggested anyway
  results = suggester.lookup(TestUtil.stringToCharSequence("ghost chris", random()),
      false, 1);
  assertEquals(1, results.size());
  assertEquals("the ghost of christmas past", results.get(0).key.toString());
  assertEquals(50, results.get(0).value, 0.01F);
}
public void testEmpty() throws Exception {
  Analyzer standard = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true,
      MockTokenFilter.ENGLISH_STOPSET);
  AnalyzingSuggester suggester = new AnalyzingSuggester(standard);
  suggester.build(new InputArrayIterator(new Input[0]));

  List<LookupResult> result = suggester.lookup("a", false, 20);
  assertTrue(result.isEmpty());
}
public void testMissingPayload() throws Exception {
  Directory dir = newDirectory();

  // MockAnalyzer minus maybePayload else it sometimes stuffs in an 8-byte payload!
  Analyzer a = new Analyzer() {
    @Override
    public TokenStreamComponents createComponents(String fieldName, Reader reader) {
      MockTokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, true, 100);
      tokenizer.setEnableChecks(true);
      MockTokenFilter filt = new MockTokenFilter(tokenizer, MockTokenFilter.EMPTY_STOPSET);
      return new TokenStreamComponents(tokenizer, filt);
    }
  };
  IndexWriterConfig iwc = newIndexWriterConfig(a);
  iwc.setCodec(TestUtil.alwaysPostingsFormat(new IDVersionPostingsFormat()));
  RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
  Document doc = new Document();
  doc.add(newTextField("id", "id", Field.Store.NO));
  try {
    w.addDocument(doc);
    w.commit();
    fail("didn't hit expected exception");
  } catch (IllegalArgumentException iae) {
    // expected
  }
  w.close();
  dir.close();
}
@Override
public Object create(Random random) {
  // TODO: could probably use a purely random automaton
  switch (random.nextInt(5)) {
    case 0:
      return MockTokenizer.KEYWORD;
    case 1:
      return MockTokenizer.SIMPLE;
    case 2:
      return MockTokenizer.WHITESPACE;
    case 3:
      return MockTokenFilter.EMPTY_STOPSET;
    default:
      return MockTokenFilter.ENGLISH_STOPSET;
  }
}
@BeforeClass
public static void beforeClass() throws Exception {
  // TODO: rewrite test (this needs to set QueryParser.enablePositionIncrements, too,
  // for work with CURRENT):
  Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true,
      MockTokenFilter.ENGLISH_STOPSET);
  // initialize the parser
  builder = new CorePlusExtensionsParser("contents", analyzer);

  BufferedReader d = new BufferedReader(new InputStreamReader(
      TestParser.class.getResourceAsStream("reuters21578.txt"), StandardCharsets.US_ASCII));
  dir = newDirectory();
  IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(analyzer));
  String line = d.readLine();
  while (line != null) {
    int endOfDate = line.indexOf('\t');
    String date = line.substring(0, endOfDate).trim();
    String content = line.substring(endOfDate).trim();
    Document doc = new Document();
    doc.add(newTextField("date", date, Field.Store.YES));
    doc.add(newTextField("contents", content, Field.Store.YES));
    doc.add(new IntField("date2", Integer.valueOf(date), Field.Store.NO));
    writer.addDocument(doc);
    line = d.readLine();
  }
  d.close();
  writer.close();
  reader = DirectoryReader.open(dir);
  searcher = newSearcher(reader);
}
public void testEndOffsetPositionStopFilter() throws Exception {
  Directory dir = newDirectory();
  IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(
      new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET)));
  Document doc = new Document();
  FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
  customType.setStoreTermVectors(true);
  customType.setStoreTermVectorPositions(true);
  customType.setStoreTermVectorOffsets(true);
  Field f = newField("field", "abcd the", customType);
  doc.add(f);
  doc.add(f);
  w.addDocument(doc);
  w.close();

  IndexReader r = DirectoryReader.open(dir);
  TermsEnum termsEnum = r.getTermVectors(0).terms("field").iterator(null);
  assertNotNull(termsEnum.next());
  DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null);
  assertEquals(2, termsEnum.totalTermFreq());

  assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  dpEnum.nextPosition();
  assertEquals(0, dpEnum.startOffset());
  assertEquals(4, dpEnum.endOffset());

  dpEnum.nextPosition();
  assertEquals(9, dpEnum.startOffset());
  assertEquals(13, dpEnum.endOffset());
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, dpEnum.nextDoc());

  r.close();
  dir.close();
}
/**
 * Return a random analyzer (simple, simple with stopwords, or whitespace)
 * to analyze the terms.
 */
private Analyzer randomAnalyzer() {
  switch (random().nextInt(3)) {
    case 0:
      return new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
    case 1:
      return new MockAnalyzer(random(), MockTokenizer.SIMPLE, true,
          MockTokenFilter.ENGLISH_STOPSET, true);
    default:
      return new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
  }
}
/**
 * Basic "standardanalyzer" test with stopword removal.
 */
public void testStandard() throws Exception {
  TermFreq keys[] = new TermFreq[] {
      new TermFreq("the ghost of christmas past", 50),
  };

  Analyzer standard = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true,
      MockTokenFilter.ENGLISH_STOPSET, false);
  FuzzySuggester suggester = new FuzzySuggester(standard);
  suggester.build(new TermFreqArrayIterator(keys));

  List<LookupResult> results = suggester.lookup(
      _TestUtil.stringToCharSequence("the ghost of chris", random()), false, 1);
  assertEquals(1, results.size());
  assertEquals("the ghost of christmas past", results.get(0).key.toString());
  assertEquals(50, results.get(0).value, 0.01F);

  // omit the 'the' since it's a stopword; it's suggested anyway
  results = suggester.lookup(_TestUtil.stringToCharSequence("ghost of chris", random()),
      false, 1);
  assertEquals(1, results.size());
  assertEquals("the ghost of christmas past", results.get(0).key.toString());
  assertEquals(50, results.get(0).value, 0.01F);

  // omit the 'the' and 'of' since they are stopwords; it's suggested anyway
  results = suggester.lookup(_TestUtil.stringToCharSequence("ghost chris", random()),
      false, 1);
  assertEquals(1, results.size());
  assertEquals("the ghost of christmas past", results.get(0).key.toString());
  assertEquals(50, results.get(0).value, 0.01F);
}
/**
 * Basic "standardanalyzer" test with stopword removal.
 */
public void testStandard() throws Exception {
  TermFreq keys[] = new TermFreq[] {
      new TermFreq("the ghost of christmas past", 50),
  };

  Analyzer standard = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true,
      MockTokenFilter.ENGLISH_STOPSET, false);
  AnalyzingSuggester suggester = new AnalyzingSuggester(standard);
  suggester.build(new TermFreqArrayIterator(keys));

  List<LookupResult> results = suggester.lookup(
      _TestUtil.stringToCharSequence("the ghost of chris", random()), false, 1);
  assertEquals(1, results.size());
  assertEquals("the ghost of christmas past", results.get(0).key.toString());
  assertEquals(50, results.get(0).value, 0.01F);

  // omit the 'the' since it's a stopword; it's suggested anyway
  results = suggester.lookup(_TestUtil.stringToCharSequence("ghost of chris", random()),
      false, 1);
  assertEquals(1, results.size());
  assertEquals("the ghost of christmas past", results.get(0).key.toString());
  assertEquals(50, results.get(0).value, 0.01F);

  // omit the 'the' and 'of' since they are stopwords; it's suggested anyway
  results = suggester.lookup(_TestUtil.stringToCharSequence("ghost chris", random()),
      false, 1);
  assertEquals(1, results.size());
  assertEquals("the ghost of christmas past", results.get(0).key.toString());
  assertEquals(50, results.get(0).value, 0.01F);
}
public void testEmpty() throws Exception {
  Analyzer standard = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true,
      MockTokenFilter.ENGLISH_STOPSET, false);
  AnalyzingSuggester suggester = new AnalyzingSuggester(standard);
  suggester.build(new TermFreqArrayIterator(new TermFreq[0]));

  List<LookupResult> result = suggester.lookup("a", false, 20);
  assertTrue(result.isEmpty());
}
@BeforeClass
public static void beforeClass() throws Exception {
  // TODO: rewrite test (this needs to set QueryParser.enablePositionIncrements, too,
  // for work with CURRENT):
  Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true,
      MockTokenFilter.ENGLISH_STOPSET, false);
  // initialize the parser
  builder = new CorePlusExtensionsParser("contents", analyzer);

  BufferedReader d = new BufferedReader(new InputStreamReader(
      TestParser.class.getResourceAsStream("reuters21578.txt"), "US-ASCII"));
  dir = newDirectory();
  IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(Version.LUCENE_40, analyzer));
  String line = d.readLine();
  while (line != null) {
    int endOfDate = line.indexOf('\t');
    String date = line.substring(0, endOfDate).trim();
    String content = line.substring(endOfDate).trim();
    Document doc = new Document();
    doc.add(newTextField("date", date, Field.Store.YES));
    doc.add(newTextField("contents", content, Field.Store.YES));
    doc.add(new IntField("date2", Integer.valueOf(date), Field.Store.NO));
    writer.addDocument(doc);
    line = d.readLine();
  }
  d.close();
  writer.close();
  reader = DirectoryReader.open(dir);
  searcher = newSearcher(reader);
}
public void testEndOffsetPositionStopFilter() throws Exception {
  Directory dir = newDirectory();
  IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT,
      new MockAnalyzer(random(), MockTokenizer.SIMPLE, true,
          MockTokenFilter.ENGLISH_STOPSET, true)));
  Document doc = new Document();
  FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
  customType.setStoreTermVectors(true);
  customType.setStoreTermVectorPositions(true);
  customType.setStoreTermVectorOffsets(true);
  Field f = newField("field", "abcd the", customType);
  doc.add(f);
  doc.add(f);
  w.addDocument(doc);
  w.close();

  IndexReader r = DirectoryReader.open(dir);
  TermsEnum termsEnum = r.getTermVectors(0).terms("field").iterator(null);
  assertNotNull(termsEnum.next());
  DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null);
  assertEquals(2, termsEnum.totalTermFreq());

  assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  dpEnum.nextPosition();
  assertEquals(0, dpEnum.startOffset());
  assertEquals(4, dpEnum.endOffset());

  dpEnum.nextPosition();
  assertEquals(9, dpEnum.startOffset());
  assertEquals(13, dpEnum.endOffset());
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, dpEnum.nextDoc());

  r.close();
  dir.close();
}
public void testPhraseQueryToString() throws Exception {
  Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true,
      MockTokenFilter.ENGLISH_STOPSET);
  CommonQueryParserConfiguration qp = getParserConfig(analyzer);
  qp.setEnablePositionIncrements(true);
  PhraseQuery q = (PhraseQuery) getQuery("\"this hi this is a test is\"", qp);
  assertEquals("field:\"? hi ? ? ? test\"", q.toString());
}
@Test
public void testWithStops() throws Exception {
  String[] docs = new String[]{"a b the d the f", "b c the d the e"};
  Analyzer analyzer = getAnalyzer(MockTokenFilter.ENGLISH_STOPSET, 50, 100);
  Directory directory = getDirectory(analyzer, docs);
  IndexReader reader = DirectoryReader.open(directory);
  IndexSearcher indexSearcher = new IndexSearcher(reader);
  IDFIndexCalc idfer = new IDFIndexCalc(reader);
  CooccurVisitor visitor = new CooccurVisitor(
      FIELD, 10, 10, new WGrammer(1, 1, false), idfer, 100, true);
  visitor.setMinTermFreq(0);

  ConcordanceArrayWindowSearcher searcher = new ConcordanceArrayWindowSearcher();
  SpanQuery q = new SpanTermQuery(new Term(FIELD, "d"));
  searcher.search(indexSearcher, FIELD, q, null, analyzer, visitor,
      new IndexIdDocIdBuilder());

  List<TermIDF> results = visitor.getResults();
  Map<String, Integer> truth = new HashMap<String, Integer>();
  truth.put("b", 2);
  truth.put("c", 1);
  truth.put("e", 1);
  truth.put("f", 1);

  assertEquals(truth.size(), results.size());
  for (TermIDF r : results) {
    assertEquals(r.getTerm(), truth.get(r.getTerm()).intValue(), r.getTermFreq());
  }
  reader.close();
  directory.close();
}
@Test
public void testWindowLengths() throws Exception {
  String[] doc = new String[]{"a b c d e f g"};
  List<String[]> docs = new ArrayList<>();
  docs.add(doc);
  Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET);
  Directory directory = getDirectory(analyzer, docs);
  IndexReader reader = DirectoryReader.open(directory);
  IndexSearcher indexSearcher = new IndexSearcher(reader);
  SpanQuery q = new SpanTermQuery(new Term(FIELD, "d"));
  String[] pres = {"", "c", "b c", "a b c", "a b c", "a b c"};
  String[] posts = {"", " e", " e f", " e f g", " e f g", " e f g"};
  for (int tokensBefore = 0; tokensBefore < pres.length; tokensBefore++) {
    for (int tokensAfter = 0; tokensAfter < posts.length; tokensAfter++) {
      WindowBuilder wb = new WindowBuilder(tokensBefore, tokensAfter,
          analyzer.getOffsetGap(FIELD));
      ConcordanceSearcher searcher = new ConcordanceSearcher(wb);
      ConcordanceWindowCollector collector = new ConcordanceWindowCollector(100);
      searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);
      ConcordanceWindow w = collector.getSortedWindows().get(0);
      assertEquals(tokensBefore + " : " + tokensAfter, pres[tokensBefore], w.getPre());
      assertEquals(tokensBefore + " : " + tokensAfter, posts[tokensAfter], w.getPost());
    }
  }
  reader.close();
  directory.close();
}
@Test
public void testWithStops() throws Exception {
  String[] docs = new String[]{"a b the d e the f", "g h the d the j"};
  Analyzer analyzer = getAnalyzer(MockTokenFilter.ENGLISH_STOPSET);
  Directory directory = getDirectory(analyzer, docs);
  IndexReader reader = DirectoryReader.open(directory);
  IndexSearcher indexSearcher = new IndexSearcher(reader);
  WindowBuilder wb = new WindowBuilder(2, 2, analyzer.getOffsetGap(FIELD));
  ConcordanceSearcher searcher = new ConcordanceSearcher(wb);
  SpanQuery q = new SpanTermQuery(new Term(FIELD, "d"));
  ConcordanceWindowCollector collector = new ConcordanceWindowCollector(3);
  searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);
  List<ConcordanceWindow> windows = collector.getSortedWindows();
  assertEquals(2, windows.size());

  // The second word after the target is a stop word, so the post-component
  // of this window should only go to the first word after the target.
  assertEquals("b the", windows.get(0).getPre());
  assertEquals("d", windows.get(0).getTarget());
  assertEquals(" e", windows.get(0).getPost());

  assertEquals("h the", windows.get(1).getPre());
  assertEquals("d", windows.get(1).getTarget());
  assertEquals(" the j", windows.get(1).getPost());
  reader.close();
  directory.close();
}
@Test
public void testBasicStandardQueryConversion() throws Exception {
  String[] docs = new String[]{"a b c a b c",
      "c b a c b a d e a",
      "c b a c b a e a b c a"};
  Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET);
  Directory directory = getDirectory(analyzer, docs);
  IndexReader reader = DirectoryReader.open(directory);
  IndexSearcher indexSearcher = new IndexSearcher(reader);
  ConcordanceSearcher searcher = new ConcordanceSearcher(
      new WindowBuilder(10, 10, analyzer.getOffsetGap(FIELD)));
  BooleanQuery q = new BooleanQuery.Builder()
      .add(new TermQuery(new Term(FIELD, "a")), Occur.MUST)
      .add(new TermQuery(new Term(FIELD, "d")), Occur.MUST_NOT)
      .build();
  ConcordanceWindowCollector collector = new ConcordanceWindowCollector(10);
  searcher.search(indexSearcher, FIELD, q, null, analyzer, collector);
  // shouldn't include document with "d"
  assertEquals(6, collector.size());

  // should only include document with "e" and not "d"
  Query filter = new TermQuery(new Term(FIELD, "e"));
  collector = new ConcordanceWindowCollector(10);
  searcher.search(indexSearcher, FIELD, (Query) q, filter, analyzer, collector);
  assertEquals(4, collector.size());
  reader.close();
  directory.close();
}
@Test
public void testUniqueCollector() throws Exception {
  String[] docs = new String[]{
      "a b c d c b a",
      "a B C d c b a",
      "a b C d C B a",
      "a b c d C B A",
      "e f g d g f e",
      "h i j d j i h"
  };
  Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET);
  Directory directory = getDirectory(analyzer, docs);
  IndexReader reader = DirectoryReader.open(directory);
  IndexSearcher indexSearcher = new IndexSearcher(reader);
  ConcordanceSearcher searcher = new ConcordanceSearcher(
      new WindowBuilder(10, 10, analyzer.getOffsetGap(FIELD)));
  SpanQuery q = new SpanTermQuery(new Term(FIELD, "d"));
  DedupingConcordanceWindowCollector collector = new DedupingConcordanceWindowCollector(2);
  searcher.search(indexSearcher, FIELD, (Query) q, null, analyzer, collector);
  assertEquals(2, collector.size());

  collector = new DedupingConcordanceWindowCollector(
      AbstractConcordanceWindowCollector.COLLECT_ALL);
  searcher.search(indexSearcher, FIELD, (Query) q, null, analyzer, collector);
  assertEquals(3, collector.size());
  reader.close();
  directory.close();
}
@Test
public void testUniqueCollectorWithSameWindowOverflow() throws Exception {
  String[] docs = new String[]{
      "a b c d c b a",
      "a b c d c b a",
      "a b c d c b a",
      "a b c d c b a",
      "e f g d g f e",
      "h i j d j i h"
  };
  Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET);
  Directory directory = getDirectory(analyzer, docs);
  IndexReader reader = DirectoryReader.open(directory);
  IndexSearcher indexSearcher = new IndexSearcher(reader);
  ConcordanceSearcher searcher = new ConcordanceSearcher(
      new WindowBuilder(10, 10, analyzer.getOffsetGap(FIELD)));
  SpanQuery q = new SpanTermQuery(new Term(FIELD, "d"));
  DedupingConcordanceWindowCollector collector = new DedupingConcordanceWindowCollector(3);
  searcher.search(indexSearcher, FIELD, (Query) q, null, analyzer, collector);
  assertEquals(3, collector.size());
  assertEquals(4, collector.getSortedWindows().get(0).getCount());
  reader.close();
  directory.close();
}
@Test
public void testRewrites() throws Exception {
  // test to make sure that queries are rewritten
  // first test straight prefix queries
  String[] docs = new String[]{"aa ba ca aa ba ca",
      "ca ba aa ca ba aa da ea za",
      "ca ba aa ca ba aa ea aa ba ca za"};
  Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET);
  Directory directory = getDirectory(analyzer, docs);
  IndexReader reader = DirectoryReader.open(directory);
  IndexSearcher indexSearcher = new IndexSearcher(reader);
  ConcordanceSearcher searcher = new ConcordanceSearcher(
      new WindowBuilder(10, 10, analyzer.getOffsetGap(FIELD)));
  BooleanQuery q = new BooleanQuery.Builder()
      .add(new PrefixQuery(new Term(FIELD, "a")), Occur.MUST)
      .add(new PrefixQuery(new Term(FIELD, "d")), Occur.MUST_NOT)
      .build();

  // now test straight and span wrapper
  ConcordanceWindowCollector collector = new ConcordanceWindowCollector(10);
  searcher.search(indexSearcher, FIELD, q,
      new PrefixQuery(new Term(FIELD, "z")), analyzer, collector);
  // shouldn't include document with "da", but must include one with "za"
  assertEquals(3, collector.size());

  collector = new ConcordanceWindowCollector(10);
  searcher.search(indexSearcher, FIELD, q,
      new SpanMultiTermQueryWrapper<>(new PrefixQuery(new Term(FIELD, "z"))),
      analyzer, collector);
  // shouldn't include document with "da", but must include one with "za"
  assertEquals(3, collector.size());
  reader.close();
  directory.close();
}
/**
 * Basic "standardanalyzer" test with stopword removal.
 */
public void testStandard() throws Exception {
  Input keys[] = new Input[] {
      new Input("the ghost of christmas past", 50),
  };

  Analyzer standard = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true,
      MockTokenFilter.ENGLISH_STOPSET);
  FuzzySuggester suggester = new FuzzySuggester(standard, standard,
      AnalyzingSuggester.EXACT_FIRST | AnalyzingSuggester.PRESERVE_SEP, 256, -1, false,
      FuzzySuggester.DEFAULT_MAX_EDITS, FuzzySuggester.DEFAULT_TRANSPOSITIONS,
      FuzzySuggester.DEFAULT_NON_FUZZY_PREFIX, FuzzySuggester.DEFAULT_MIN_FUZZY_LENGTH,
      FuzzySuggester.DEFAULT_UNICODE_AWARE);
  suggester.build(new InputArrayIterator(keys));

  List<LookupResult> results = suggester.lookup(
      _TestUtil.stringToCharSequence("the ghost of chris", random()), false, 1);
  assertEquals(1, results.size());
  assertEquals("the ghost of christmas past", results.get(0).key.toString());
  assertEquals(50, results.get(0).value, 0.01F);

  // omit the 'the' since it's a stopword; it's suggested anyway
  results = suggester.lookup(_TestUtil.stringToCharSequence("ghost of chris", random()),
      false, 1);
  assertEquals(1, results.size());
  assertEquals("the ghost of christmas past", results.get(0).key.toString());
  assertEquals(50, results.get(0).value, 0.01F);

  // omit the 'the' and 'of' since they are stopwords; it's suggested anyway
  results = suggester.lookup(_TestUtil.stringToCharSequence("ghost chris", random()),
      false, 1);
  assertEquals(1, results.size());
  assertEquals("the ghost of christmas past", results.get(0).key.toString());
  assertEquals(50, results.get(0).value, 0.01F);
}