public void testCountPositions() throws IOException {
    // We're looking to make sure that we:
    Token t1 = new Token();      // Don't count tokens without an increment
    t1.setPositionIncrement(0);
    Token t2 = new Token();
    t2.setPositionIncrement(1);  // Count normal tokens with one increment
    Token t3 = new Token();
    t3.setPositionIncrement(2);  // Count funny tokens with more than one increment
    int finalTokenIncrement = 4; // Count the final token increment on the rare token streams that have them
    Token[] tokens = new Token[] { t1, t2, t3 };
    Collections.shuffle(Arrays.asList(tokens), random());
    final TokenStream tokenStream = new CannedTokenStream(finalTokenIncrement, 0, tokens);
    // TODO: we have no CannedAnalyzer?
    Analyzer analyzer = new Analyzer() {
        @Override
        public TokenStreamComponents createComponents(String fieldName) {
            return new TokenStreamComponents(new MockTokenizer(), tokenStream);
        }
    };
    assertThat(TokenCountFieldMapper.countPositions(analyzer, "", ""), equalTo(7));
}
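// The TODO above notes the lack of a CannedAnalyzer. A minimal sketch of what
// such a helper could look like, assuming it only needs to replay one fixed
// TokenStream regardless of field name; the class name and shape here are
// hypothetical, not an existing Lucene or Elasticsearch API.
class CannedAnalyzer extends Analyzer {
    private final TokenStream stream;

    CannedAnalyzer(TokenStream stream) {
        this.stream = stream;
    }

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        // The tokenizer never sees real input; the canned stream is returned as the result.
        return new TokenStreamComponents(new MockTokenizer(), stream);
    }
}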
public void testBogusTermVectors() throws IOException {
    Directory dir = newDirectory();
    IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(null));
    Document doc = new Document();
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorOffsets(true);
    Field field = new Field("foo", "", ft);
    field.setTokenStream(new CannedTokenStream(
        new Token("bar", 5, 10), new Token("bar", 1, 4)
    ));
    doc.add(field);
    iw.addDocument(doc);
    iw.close();
    dir.close(); // checkindex
}
public void testIllegalPositions() throws Exception {
    Directory dir = newDirectory();
    IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(null));
    Document doc = new Document();
    Token t1 = new Token("foo", 0, 3);
    t1.setPositionIncrement(Integer.MAX_VALUE);
    Token t2 = new Token("bar", 4, 7);
    t2.setPositionIncrement(200);
    TokenStream overflowingTokenStream = new CannedTokenStream(new Token[] { t1, t2 });
    Field field = new TextField("foo", overflowingTokenStream);
    doc.add(field);
    try {
        iw.addDocument(doc);
        fail();
    } catch (IllegalArgumentException expected) {
        // expected exception
    }
    iw.close();
    dir.close();
}
public void testLegalbutVeryLargePositions() throws Exception {
    Directory dir = newDirectory();
    IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(null));
    Document doc = new Document();
    Token t1 = new Token("foo", 0, 3);
    t1.setPositionIncrement(Integer.MAX_VALUE - 500);
    if (random().nextBoolean()) {
        t1.setPayload(new BytesRef(new byte[] { 0x1 }));
    }
    TokenStream overflowingTokenStream = new CannedTokenStream(new Token[] { t1 });
    Field field = new TextField("foo", overflowingTokenStream);
    doc.add(field);
    iw.addDocument(doc);
    iw.close();
    dir.close();
}
public void testLegalbutVeryLargeOffsets() throws Exception {
    Directory dir = newDirectory();
    IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(null));
    Document doc = new Document();
    Token t1 = new Token("foo", 0, Integer.MAX_VALUE - 500);
    if (random().nextBoolean()) {
        t1.setPayload(new BytesRef("test"));
    }
    Token t2 = new Token("foo", Integer.MAX_VALUE - 500, Integer.MAX_VALUE);
    TokenStream tokenStream = new CannedTokenStream(new Token[] { t1, t2 });
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    // store some term vectors for the checkindex cross-check
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorPositions(true);
    ft.setStoreTermVectorOffsets(true);
    Field field = new Field("foo", tokenStream, ft);
    doc.add(field);
    iw.addDocument(doc);
    iw.close();
    dir.close();
}
private void checkTokens(Token[] field1, Token[] field2) throws IOException {
    Directory dir = newDirectory();
    // iwc is an IndexWriterConfig field assumed to be initialized elsewhere in the test class
    RandomIndexWriter riw = new RandomIndexWriter(random(), dir, iwc);
    boolean success = false;
    try {
        FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
        ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
        // store some term vectors for the checkindex cross-check
        ft.setStoreTermVectors(true);
        ft.setStoreTermVectorPositions(true);
        ft.setStoreTermVectorOffsets(true);
        Document doc = new Document();
        doc.add(new Field("body", new CannedTokenStream(field1), ft));
        doc.add(new Field("body", new CannedTokenStream(field2), ft));
        riw.addDocument(doc);
        riw.close();
        success = true;
    } finally {
        if (success) {
            IOUtils.close(dir); // riw was already closed in the try block
        } else {
            IOUtils.closeWhileHandlingException(riw, dir);
        }
    }
}
private void checkTokens(Token[] tokens) throws IOException {
    Directory dir = newDirectory();
    RandomIndexWriter riw = new RandomIndexWriter(random(), dir, iwc);
    boolean success = false;
    try {
        FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
        ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
        // store some term vectors for the checkindex cross-check
        ft.setStoreTermVectors(true);
        ft.setStoreTermVectorPositions(true);
        ft.setStoreTermVectorOffsets(true);
        Document doc = new Document();
        doc.add(new Field("body", new CannedTokenStream(tokens), ft));
        riw.addDocument(doc);
        success = true;
    } finally {
        if (success) {
            IOUtils.close(riw, dir);
        } else {
            IOUtils.closeWhileHandlingException(riw, dir);
        }
    }
}
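// A hypothetical caller for the checkTokens helpers above, illustrating the kind
// of canned input they expect; the token values here are made up for illustration.
public void testCheckTokensWithHole() throws IOException {
    Token t1 = new Token("foo", 0, 3);
    Token t2 = new Token("bar", 4, 7);
    t2.setPositionIncrement(2); // leave a one-position hole between the tokens
    checkTokens(new Token[] { t1, t2 });
}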
public void testTextFieldString() throws Exception {
    Field[] fields = new Field[] {
        new TextField("foo", "bar", Field.Store.NO),
        new TextField("foo", "bar", Field.Store.YES)
    };
    for (Field field : fields) {
        field.setBoost(5f);
        trySetByteValue(field);
        trySetBytesValue(field);
        trySetBytesRefValue(field);
        trySetDoubleValue(field);
        trySetIntValue(field);
        trySetFloatValue(field);
        trySetLongValue(field);
        trySetReaderValue(field);
        trySetShortValue(field);
        field.setStringValue("baz");
        field.setTokenStream(new CannedTokenStream(new Token("foo", 0, 3)));
        assertEquals("baz", field.stringValue());
        assertEquals(5f, field.boost(), 0f);
    }
}
public void testTextFieldReader() throws Exception {
    Field field = new TextField("foo", new StringReader("bar"));
    field.setBoost(5f);
    trySetByteValue(field);
    trySetBytesValue(field);
    trySetBytesRefValue(field);
    trySetDoubleValue(field);
    trySetIntValue(field);
    trySetFloatValue(field);
    trySetLongValue(field);
    field.setReaderValue(new StringReader("foobar"));
    trySetShortValue(field);
    trySetStringValue(field);
    field.setTokenStream(new CannedTokenStream(new Token("foo", 0, 3)));
    assertNotNull(field.readerValue());
    assertEquals(5f, field.boost(), 0f);
}
public void testBogusTermVectors() throws IOException {
    Directory dir = newDirectory();
    IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, null));
    Document doc = new Document();
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorOffsets(true);
    Field field = new Field("foo", "", ft);
    field.setTokenStream(new CannedTokenStream(
        new Token("bar", 5, 10), new Token("bar", 1, 4)
    ));
    doc.add(field);
    iw.addDocument(doc);
    iw.close();
    dir.close(); // checkindex
}
public void testLegalbutVeryLargeOffsets() throws Exception {
    Directory dir = newDirectory();
    IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, null));
    Document doc = new Document();
    Token t1 = new Token("foo", 0, Integer.MAX_VALUE - 500);
    if (random().nextBoolean()) {
        t1.setPayload(new BytesRef("test"));
    }
    Token t2 = new Token("foo", Integer.MAX_VALUE - 500, Integer.MAX_VALUE);
    TokenStream tokenStream = new CannedTokenStream(new Token[] { t1, t2 });
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    // store some term vectors for the checkindex cross-check
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorPositions(true);
    ft.setStoreTermVectorOffsets(true);
    Field field = new Field("foo", tokenStream, ft);
    doc.add(field);
    iw.addDocument(doc);
    iw.close();
    dir.close();
}
public void testIllegalPositions() throws Exception {
    Directory dir = newDirectory();
    IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, null));
    Document doc = new Document();
    Token t1 = new Token("foo", 0, 3);
    t1.setPositionIncrement(Integer.MAX_VALUE);
    Token t2 = new Token("bar", 4, 7);
    t2.setPositionIncrement(200);
    TokenStream overflowingTokenStream = new CannedTokenStream(new Token[] { t1, t2 });
    Field field = new TextField("foo", overflowingTokenStream);
    doc.add(field);
    try {
        iw.addDocument(doc);
        fail();
    } catch (IllegalArgumentException expected) {
        // expected exception
    }
    iw.close();
    dir.close();
}
public void testLegalbutVeryLargePositions() throws Exception {
    Directory dir = newDirectory();
    IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, null));
    Document doc = new Document();
    Token t1 = new Token("foo", 0, 3);
    t1.setPositionIncrement(Integer.MAX_VALUE - 500);
    if (random().nextBoolean()) {
        t1.setPayload(new BytesRef(new byte[] { 0x1 }));
    }
    TokenStream overflowingTokenStream = new CannedTokenStream(new Token[] { t1 });
    Field field = new TextField("foo", overflowingTokenStream);
    doc.add(field);
    iw.addDocument(doc);
    iw.close();
    dir.close();
}
public void testBasic() throws IOException {
    Index index = new Index("test", "_na_");
    String name = "ngr";
    Settings indexSettings = newAnalysisSettingsBuilder().build();
    IndexSettings indexProperties = IndexSettingsModule.newIndexSettings(index, indexSettings);
    Settings settings = newAnalysisSettingsBuilder().build();
    // "wow that's funny" and "what the fudge" are separate side paths, in parallel with "wtf", on input:
    TokenStream in = new CannedTokenStream(0, 12, new Token[] {
        token("wtf", 1, 5, 0, 3),
        token("what", 0, 1, 0, 3),
        token("wow", 0, 3, 0, 3),
        token("the", 1, 1, 0, 3),
        token("fudge", 1, 3, 0, 3),
        token("that's", 1, 1, 0, 3),
        token("funny", 1, 1, 0, 3),
        token("happened", 1, 1, 4, 12)
    });
    TokenStream tokens = new FlattenGraphTokenFilterFactory(indexProperties, null, name, settings).create(in);
    // ... but on output, it's flattened to wtf/what/wow that's/the fudge/funny happened:
    assertTokenStreamContents(tokens,
        new String[] { "wtf", "what", "wow", "the", "that's", "fudge", "funny", "happened" },
        new int[] { 0, 0, 0, 0, 0, 0, 0, 4 },
        new int[] { 3, 3, 3, 3, 3, 3, 3, 12 },
        new int[] { 1, 0, 0, 1, 0, 1, 0, 1 },
        new int[] { 3, 1, 1, 1, 1, 1, 1, 1 },
        12);
}
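// The token(...) helper used above is not part of this excerpt; a plausible
// sketch, assuming it mirrors the usual Lucene test convenience for building a
// Token with an explicit position increment, position length, and offsets.
private static Token token(String term, int posInc, int posLength, int startOffset, int endOffset) {
    final Token t = new Token(term, startOffset, endOffset);
    t.setPositionIncrement(posInc);
    t.setPositionLength(posLength);
    return t;
}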
private URLTokenFilter createFilter(final String url, final URLPart part,
                                    final boolean urlDecode, final boolean allowMalformed) {
    int length = 0;
    if (url != null) {
        length = url.length();
    }
    return new URLTokenFilter(new CannedTokenStream(new Token(url, 0, length)), part, urlDecode, allowMalformed);
}
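// A hedged usage sketch for the helper above, assuming URLPart.HOST selects the
// hostname component of the URL; the expected token is illustrative, not a
// confirmed behavior of the URLTokenFilter plugin.
public void testFilterHost() throws IOException {
    URLTokenFilter filter = createFilter("http://www.example.com/foo", URLPart.HOST, false, false);
    assertTokenStreamContents(filter, new String[] { "www.example.com" });
}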
public void testEmptyString() throws IOException {
    MemoryIndex memory = new MemoryIndex();
    memory.addField("foo", new CannedTokenStream(new Token("", 0, 5)));
    IndexSearcher searcher = memory.createSearcher();
    TopDocs docs = searcher.search(new TermQuery(new Term("foo", "")), 10);
    assertEquals(1, docs.totalHits);
}
public void testAnyFromTokenStream() throws Exception {
    Directory dir = newDirectory();
    RandomIndexWriter w = new RandomIndexWriter(random(), dir);
    Document doc = new Document();
    doc.add(newTextField("field", "here comes the sun", Field.Store.NO));
    w.addDocument(doc);
    doc = new Document();
    doc.add(newTextField("field", "here comes the moon", Field.Store.NO));
    w.addDocument(doc);
    doc = new Document();
    doc.add(newTextField("field", "here comes sun", Field.Store.NO));
    w.addDocument(doc);
    // Should not match:
    doc = new Document();
    doc.add(newTextField("field", "here comes the other sun", Field.Store.NO));
    w.addDocument(doc);
    IndexReader r = w.getReader();
    IndexSearcher s = newSearcher(r);
    TokenStream ts = new CannedTokenStream(new Token[] {
        token("comes", 1, 1),
        token("comes", 0, 2),
        token("*", 1, 1),
        token("sun", 1, 1),
        token("moon", 0, 1)
    });
    TermAutomatonQuery q = new TokenStreamToTermAutomatonQuery().toQuery("field", ts);
    // System.out.println("DOT: " + q.toDot());
    assertEquals(3, s.search(q, 1).totalHits);
    w.close();
    r.close();
    dir.close();
}
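// The three-argument token(...) helper used above is not shown; a plausible
// sketch, assuming the offsets are irrelevant to the term automaton and simply
// default to the bounds of the term text.
private static Token token(String term, int posInc, int posLength) {
    final Token t = new Token(term, 0, term.length());
    t.setPositionIncrement(posInc);
    t.setPositionLength(posLength);
    return t;
}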
protected void shingleFilterTest(int maxSize, Token[] tokensToShingle, Token[] tokensToCompare,
                                 int[] positionIncrements, String[] types,
                                 boolean outputUnigrams) throws IOException {
    ShingleFilter filter = new ShingleFilter(new CannedTokenStream(tokensToShingle), maxSize);
    filter.setOutputUnigrams(outputUnigrams);
    shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
}
protected void shingleFilterTest(int minSize, int maxSize, Token[] tokensToShingle,
                                 Token[] tokensToCompare, int[] positionIncrements, String[] types,
                                 boolean outputUnigrams) throws IOException {
    ShingleFilter filter = new ShingleFilter(new CannedTokenStream(tokensToShingle), minSize, maxSize);
    filter.setOutputUnigrams(outputUnigrams);
    shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
}
protected void shingleFilterTest(int minSize, int maxSize, Token[] tokensToShingle,
                                 Token[] tokensToCompare, int[] positionIncrements, String[] types,
                                 boolean outputUnigrams, boolean outputUnigramsIfNoShingles) throws IOException {
    ShingleFilter filter = new ShingleFilter(new CannedTokenStream(tokensToShingle), minSize, maxSize);
    filter.setOutputUnigrams(outputUnigrams);
    filter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
    shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
}
protected void shingleFilterTest(String tokenSeparator, int minSize, int maxSize, Token[] tokensToShingle,
                                 Token[] tokensToCompare, int[] positionIncrements, String[] types,
                                 boolean outputUnigrams) throws IOException {
    ShingleFilter filter = new ShingleFilter(new CannedTokenStream(tokensToShingle), minSize, maxSize);
    filter.setTokenSeparator(tokenSeparator);
    filter.setOutputUnigrams(outputUnigrams);
    shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
}
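// The four overloads above all delegate to shingleFilterTestCommon, which is not
// included in this excerpt. A sketch under the assumption that it derives the
// expected terms and offsets from tokensToCompare and defers to
// assertTokenStreamContents; the real helper may differ in detail.
protected void shingleFilterTestCommon(ShingleFilter filter, Token[] tokensToCompare,
                                       int[] positionIncrements, String[] types) throws IOException {
    String[] text = new String[tokensToCompare.length];
    int[] startOffsets = new int[tokensToCompare.length];
    int[] endOffsets = new int[tokensToCompare.length];
    for (int i = 0; i < tokensToCompare.length; i++) {
        text[i] = new String(tokensToCompare[i].buffer(), 0, tokensToCompare[i].length());
        startOffsets[i] = tokensToCompare[i].startOffset();
        endOffsets[i] = tokensToCompare[i].endOffset();
    }
    assertTokenStreamContents(filter, text, startOffsets, endOffsets, types, positionIncrements);
}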
public void testTrailingHole1() throws IOException {
    // Analyzing "wizard of", where "of" is removed as a
    // stopword, leaving a trailing hole:
    Token[] inputTokens = new Token[] { createToken("wizard", 0, 6) };
    ShingleFilter filter = new ShingleFilter(new CannedTokenStream(1, 9, inputTokens), 2, 2);
    assertTokenStreamContents(filter,
        new String[] { "wizard", "wizard _" },
        new int[] { 0, 0 },
        new int[] { 6, 9 },
        new int[] { 1, 0 },
        9);
}
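// createToken(...) is not shown in this excerpt; a minimal sketch, assuming it
// simply builds a Token over the given term and offsets. The same helper is
// shared by the other trailing-hole tests here.
private static Token createToken(String term, int start, int offset) {
    return new Token(term, start, offset);
}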
public void testTrailingHole2() throws IOException {
    // Analyzing "purple wizard of", where "of" is removed as a
    // stopword, leaving a trailing hole:
    Token[] inputTokens = new Token[] { createToken("purple", 0, 6), createToken("wizard", 7, 13) };
    ShingleFilter filter = new ShingleFilter(new CannedTokenStream(1, 16, inputTokens), 2, 2);
    assertTokenStreamContents(filter,
        new String[] { "purple", "purple wizard", "wizard", "wizard _" },
        new int[] { 0, 0, 7, 7 },
        new int[] { 6, 13, 13, 16 },
        new int[] { 1, 0, 1, 0 },
        16);
}
public void testTwoTrailingHoles() throws IOException {
    // Analyzing "purple wizard of the", where "of" and "the" are removed as
    // stopwords, leaving two trailing holes:
    Token[] inputTokens = new Token[] { createToken("purple", 0, 6), createToken("wizard", 7, 13) };
    ShingleFilter filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 2);
    assertTokenStreamContents(filter,
        new String[] { "purple", "purple wizard", "wizard", "wizard _" },
        new int[] { 0, 0, 7, 7 },
        new int[] { 6, 13, 13, 20 },
        new int[] { 1, 0, 1, 0 },
        20);
}
public void testTwoTrailingHolesTriShingle() throws IOException {
    // Analyzing "purple wizard of the", where "of" and "the" are removed as
    // stopwords, leaving two trailing holes:
    Token[] inputTokens = new Token[] { createToken("purple", 0, 6), createToken("wizard", 7, 13) };
    ShingleFilter filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
    assertTokenStreamContents(filter,
        new String[] { "purple", "purple wizard", "purple wizard _", "wizard", "wizard _", "wizard _ _" },
        new int[] { 0, 0, 0, 7, 7, 7 },
        new int[] { 6, 13, 20, 13, 20, 20 },
        new int[] { 1, 0, 0, 1, 0, 0 },
        20);
}
private void doTestZeroPosIncrSloppy(Query q, int nExpected) throws IOException {
    Directory dir = newDirectory(); // random dir
    IndexWriterConfig cfg = newIndexWriterConfig(null);
    IndexWriter writer = new IndexWriter(dir, cfg);
    Document doc = new Document();
    doc.add(new TextField("field", new CannedTokenStream(INCR_0_DOC_TOKENS)));
    writer.addDocument(doc);
    IndexReader r = DirectoryReader.open(writer, false);
    writer.close();
    IndexSearcher s = newSearcher(r);
    if (VERBOSE) {
        System.out.println("QUERY=" + q);
    }
    TopDocs hits = s.search(q, 1);
    assertEquals("wrong number of results", nExpected, hits.totalHits);
    if (VERBOSE) {
        for (int hit = 0; hit < hits.totalHits; hit++) {
            ScoreDoc sd = hits.scoreDocs[hit];
            System.out.println("  hit doc=" + sd.doc + " score=" + sd.score);
        }
    }
    r.close();
    dir.close();
}
private void trySetTokenStreamValue(Field f) {
    try {
        f.setTokenStream(new CannedTokenStream(new Token("foo", 0, 3)));
        fail();
    } catch (IllegalArgumentException expected) {
        // expected
    }
}
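// trySetTokenStreamValue is one of a family of trySetXxxValue helpers exercised
// by the TextField tests above; a representative sketch of another member,
// assuming they all follow the same expect-IllegalArgumentException pattern.
private void trySetByteValue(Field f) {
    try {
        f.setByteValue((byte) 10);
        fail();
    } catch (IllegalArgumentException expected) {
        // expected
    }
}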
private void doTestZeroPosIncrSloppy(Query q, int nExpected) throws IOException {
    Directory dir = newDirectory(); // random dir
    IndexWriterConfig cfg = newIndexWriterConfig(TEST_VERSION_CURRENT, null);
    IndexWriter writer = new IndexWriter(dir, cfg);
    Document doc = new Document();
    doc.add(new TextField("field", new CannedTokenStream(INCR_0_DOC_TOKENS)));
    writer.addDocument(doc);
    IndexReader r = DirectoryReader.open(writer, false);
    writer.close();
    IndexSearcher s = new IndexSearcher(r);
    if (VERBOSE) {
        System.out.println("QUERY=" + q);
    }
    TopDocs hits = s.search(q, 1);
    assertEquals("wrong number of results", nExpected, hits.totalHits);
    if (VERBOSE) {
        for (int hit = 0; hit < hits.totalHits; hit++) {
            ScoreDoc sd = hits.scoreDocs[hit];
            System.out.println("  hit doc=" + sd.doc + " score=" + sd.score);
        }
    }
    r.close();
    dir.close();
}