/** * populates a writer with random stuff. this must be fully reproducable with * the seed! */ public static void createRandomIndex(int numdocs, RandomIndexWriter writer, long seed) throws IOException { Random random = new Random(seed); // primary source for our data is from linefiledocs, its realistic. LineFileDocs lineFileDocs = new LineFileDocs(random, false); // no docvalues in 4x // TODO: we should add other fields that use things like docs&freqs but omit // positions, // because linefiledocs doesn't cover all the possibilities. for (int i = 0; i < numdocs; i++) { writer.addDocument(lineFileDocs.nextDoc()); } lineFileDocs.close(); }
/**
 * Builds a random index in {@code dir} where each document stores its integer
 * boost (0..254) as the value of {@code byteTestField}, so a later check can
 * recompute the expected norm from the stored value. Commits rarely during
 * indexing and once at the end.
 *
 * @param dir directory to build the index in (not closed by this method)
 */
public void buildIndex(Directory dir) throws IOException {
  Random random = random();
  MockAnalyzer analyzer = new MockAnalyzer(random());
  analyzer.setMaxTokenLength(TestUtil.nextInt(random(), 1, IndexWriter.MAX_TERM_LENGTH));
  IndexWriterConfig config = newIndexWriterConfig(analyzer);
  Similarity provider = new MySimProvider();
  config.setSimilarity(provider);
  RandomIndexWriter writer = new RandomIndexWriter(random, dir, config);
  final LineFileDocs docs = new LineFileDocs(random, defaultCodecSupportsDocValues());
  try {
    int num = atLeast(100);
    for (int i = 0; i < num; i++) {
      Document doc = docs.nextDoc();
      int boost = random().nextInt(255);
      Field f = new TextField(byteTestField, "" + boost, Field.Store.YES);
      f.setBoost(boost);
      doc.add(f);
      writer.addDocument(doc);
      // strip our field again before the next round (the doc instance may be reused — confirm)
      doc.removeField(byteTestField);
      if (rarely()) {
        writer.commit();
      }
    }
    writer.commit();
  } finally {
    // release writer and the line-file source even if indexing throws
    writer.close();
    docs.close();
  }
}
/**
 * Builds a random index in {@code dir} where each document stores its integer
 * boost (0..254) as the value of {@code byteTestField}, so a later check can
 * recompute the expected norm from the stored value. Commits rarely during
 * indexing and once at the end.
 *
 * @param dir directory to build the index in (not closed by this method)
 */
public void buildIndex(Directory dir) throws IOException {
  Random random = random();
  IndexWriterConfig config = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
  Similarity provider = new MySimProvider();
  config.setSimilarity(provider);
  RandomIndexWriter writer = new RandomIndexWriter(random, dir, config);
  final LineFileDocs docs = new LineFileDocs(random, defaultCodecSupportsDocValues());
  try {
    int num = atLeast(100);
    for (int i = 0; i < num; i++) {
      Document doc = docs.nextDoc();
      int boost = random().nextInt(255);
      Field f = new TextField(byteTestField, "" + boost, Field.Store.YES);
      f.setBoost(boost);
      doc.add(f);
      writer.addDocument(doc);
      // strip our field again before the next round (the doc instance may be reused — confirm)
      doc.removeField(byteTestField);
      if (rarely()) {
        writer.commit();
      }
    }
    writer.commit();
  } finally {
    // release writer and the line-file source even if indexing throws
    writer.close();
    docs.close();
  }
}
/** * populates a writer with random stuff. this must be fully reproducable with the seed! */ public static void createRandomIndex(int numdocs, RandomIndexWriter writer, long seed) throws IOException { Random random = new Random(seed); // primary source for our data is from linefiledocs, its realistic. LineFileDocs lineFileDocs = new LineFileDocs(random); // TODO: we should add other fields that use things like docs&freqs but omit positions, // because linefiledocs doesn't cover all the possibilities. for (int i = 0; i < numdocs; i++) { Document document = lineFileDocs.nextDoc(); // grab the title and add some SortedSet instances for fun String title = document.get("titleTokenized"); String split[] = title.split("\\s+"); for (String trash : split) { document.add(new SortedSetDocValuesField("sortedset", new BytesRef(trash))); } writer.addDocument(document); } lineFileDocs.close(); }
/** * populates a writer with random stuff. this must be fully reproducable with the seed! */ public static void createRandomIndex(int numdocs, RandomIndexWriter writer, long seed) throws IOException { Random random = new Random(seed); // primary source for our data is from linefiledocs, its realistic. LineFileDocs lineFileDocs = new LineFileDocs(random); // TODO: we should add other fields that use things like docs&freqs but omit positions, // because linefiledocs doesn't cover all the possibilities. for (int i = 0; i < numdocs; i++) { Document document = lineFileDocs.nextDoc(); // grab the title and add some SortedSet instances for fun String title = document.get("titleTokenized"); String split[] = title.split("\\s+"); for (String trash : split) { document.add(new SortedSetDocValuesField("sortedset", new BytesRef(trash))); } // add a numeric dv field sometimes document.removeFields("sparsenumeric"); if (random.nextInt(4) == 2) { document.add(new NumericDocValuesField("sparsenumeric", random.nextInt())); } writer.addDocument(document); } lineFileDocs.close(); }
public void testTranslogOpsCountIsCorrect() throws IOException { List<Translog.Location> locations = new ArrayList<>(); int numOps = randomIntBetween(100, 200); LineFileDocs lineFileDocs = new LineFileDocs(random()); // writes pretty big docs so we cross buffer boarders regularly for (int opsAdded = 0; opsAdded < numOps; opsAdded++) { locations.add(translog.add(new Translog.Index("test", "" + opsAdded, lineFileDocs.nextDoc().toString().getBytes(Charset.forName("UTF-8"))))); Translog.Snapshot snapshot = this.translog.newSnapshot(); assertEquals(opsAdded + 1, snapshot.totalOperations()); for (int i = 0; i < opsAdded; i++) { assertEquals("expected operation" + i + " to be in the current translog but wasn't", translog.currentFileGeneration(), locations.get(i).generation); Translog.Operation next = snapshot.next(); assertNotNull("operation " + i + " must be non-null", next); } } }
/**
 * Feeds concatenated "body" fields of random line-file documents (in 10 batches
 * of 1..200 docs) through doTest as UTF-8 bytes.
 */
public void testLineDocs() throws IOException {
  Random r = random();
  LineFileDocs lineFileDocs = new LineFileDocs(r);
  try {
    for (int i = 0; i < 10; i++) {
      int numDocs = TestUtil.nextInt(r, 1, 200);
      ByteArrayOutputStream bos = new ByteArrayOutputStream();
      for (int j = 0; j < numDocs; j++) {
        String s = lineFileDocs.nextDoc().get("body");
        bos.write(s.getBytes(StandardCharsets.UTF_8));
      }
      doTest(bos.toByteArray());
    }
  } finally {
    // close in finally so the line file is released even if doTest throws
    lineFileDocs.close();
  }
}
/**
 * Runs the line-docs round-trip concurrently: 2..6 threads, each with its own
 * seeded Random and LineFileDocs, released together by a starting gun, each
 * performing the same batched doTest work as testLineDocs.
 */
public void testLineDocsThreads() throws Exception {
  final Random r = random();
  int threadCount = TestUtil.nextInt(r, 2, 6);
  Thread[] threads = new Thread[threadCount];
  final CountDownLatch startingGun = new CountDownLatch(1);
  for (int tid = 0; tid < threadCount; tid++) {
    // each thread gets its own deterministic seed drawn up front
    final long seed = r.nextLong();
    threads[tid] = new Thread() {
      @Override
      public void run() {
        try {
          Random r = new Random(seed);
          startingGun.await();
          LineFileDocs lineFileDocs = new LineFileDocs(r);
          try {
            for (int i = 0; i < 10; i++) {
              int numDocs = TestUtil.nextInt(r, 1, 200);
              ByteArrayOutputStream bos = new ByteArrayOutputStream();
              for (int j = 0; j < numDocs; j++) {
                String s = lineFileDocs.nextDoc().get("body");
                bos.write(s.getBytes(StandardCharsets.UTF_8));
              }
              doTest(bos.toByteArray());
            }
          } finally {
            // release the per-thread docs source even if doTest throws
            lineFileDocs.close();
          }
        } catch (Exception e) {
          throw new RuntimeException(e);
        }
      }
    };
    threads[tid].start();
  }
  startingGun.countDown();
  for (Thread t : threads) {
    t.join();
  }
}
/**
 * Builds two ~400KB buffers from a random mix of ints, longs, line-file document
 * bodies, and raw bytes, and runs each through doTest.
 */
public void testMixed() throws IOException {
  Random r = random();
  LineFileDocs lineFileDocs = new LineFileDocs(r);
  try {
    for (int i = 0; i < 2; ++i) {
      ByteArrayOutputStream bos = new ByteArrayOutputStream();
      int prevInt = r.nextInt();
      long prevLong = r.nextLong();
      while (bos.size() < 400000) {
        switch (r.nextInt(4)) {
          case 0:
            addInt(r, prevInt, bos);
            break;
          case 1:
            addLong(r, prevLong, bos);
            break;
          case 2:
            addString(lineFileDocs, bos);
            break;
          case 3:
            addBytes(r, bos);
            break;
          default:
            throw new IllegalStateException("Random is broken");
        }
      }
      doTest(bos.toByteArray());
    }
  } finally {
    // the original never closed the docs source; release it deterministically
    lineFileDocs.close();
  }
}
@Override public void run() { try { final LineFileDocs docs = new LineFileDocs(random(), defaultCodecSupportsDocValues()); int numDocs = 0; while (System.nanoTime() < endTimeNanos) { final int what = random().nextInt(3); final NodeState node = nodes[random().nextInt(nodes.length)]; if (numDocs == 0 || what == 0) { node.writer.addDocument(docs.nextDoc()); numDocs++; } else if (what == 1) { node.writer.updateDocument(new Term("docid", ""+random().nextInt(numDocs)), docs.nextDoc()); numDocs++; } else { node.writer.deleteDocuments(new Term("docid", ""+random().nextInt(numDocs))); } // TODO: doc blocks too if (random().nextInt(17) == 12) { node.writer.commit(); } if (random().nextInt(17) == 12) { nodes[random().nextInt(nodes.length)].reopen(); } } } catch (Throwable t) { System.out.println("FAILED:"); t.printStackTrace(System.out); throw new RuntimeException(t); } }
/**
 * Populates a writer with random stuff. This must be fully reproducable with
 * the caller-supplied Random (pass one constructed from a fixed seed).
 *
 * @param numdocs number of documents to add
 * @param writer  destination writer (not closed by this method)
 * @param random  randomness source; determines the documents produced
 */
public static void createRandomIndex(int numdocs, RandomIndexWriter writer, Random random) throws IOException {
  LineFileDocs lineFileDocs = new LineFileDocs(random);
  try {
    for (int i = 0; i < numdocs; i++) {
      writer.addDocument(lineFileDocs.nextDoc());
    }
  } finally {
    // close in finally so the line file is released even if addDocument throws
    lineFileDocs.close();
  }
}
/**
 * Indexes random docs whose per-field float boost is also stored as the field's
 * value, then re-opens the index and asserts that each doc's norm, reinterpreted
 * as float bits, equals the stored boost exactly.
 */
public void testFloatNorms() throws IOException {
  Directory dir = newDirectory();
  MockAnalyzer analyzer = new MockAnalyzer(random());
  analyzer.setMaxTokenLength(TestUtil.nextInt(random(), 1, IndexWriter.MAX_TERM_LENGTH));
  IndexWriterConfig config = newIndexWriterConfig(analyzer);
  // MySimProvider controls how the boost is encoded into the norm
  Similarity provider = new MySimProvider();
  config.setSimilarity(provider);
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir, config);
  final LineFileDocs docs = new LineFileDocs(random());
  int num = atLeast(100);
  for (int i = 0; i < num; i++) {
    Document doc = docs.nextDoc();
    float nextFloat = random().nextFloat();
    // store the boost as the field's value so the expectation can be recomputed below
    Field f = new TextField(floatTestField, "" + nextFloat, Field.Store.YES);
    f.setBoost(nextFloat);
    doc.add(f);
    writer.addDocument(doc);
    // strip our field before the next iteration (the doc instance may be reused — confirm)
    doc.removeField(floatTestField);
    if (rarely()) {
      writer.commit();
    }
  }
  writer.commit();
  writer.close();
  AtomicReader open = SlowCompositeReaderWrapper.wrap(DirectoryReader.open(dir));
  NumericDocValues norms = open.getNormValues(floatTestField);
  assertNotNull(norms);
  for (int i = 0; i < open.maxDoc(); i++) {
    Document document = open.document(i);
    float expected = Float.parseFloat(document.get(floatTestField));
    // norm holds the raw float bits of the boost (that is what this asserts)
    assertEquals(expected, Float.intBitsToFloat((int) norms.get(i)), 0.0f);
  }
  open.close();
  dir.close();
  docs.close();
}
/**
 * Creates an indexing worker that shares the given writer and document source.
 *
 * NOTE(review): {@code numThreads} is accepted but never stored or read in this
 * constructor — confirm whether callers rely on it or it can be removed.
 */
public IndexThread(AtomicInteger pendingDocs, int numThreads, IndexWriter writer, LineFileDocs docs, boolean doRandomCommit) {
  this.pendingDocs = pendingDocs;
  this.writer = writer;
  iwc = writer.getConfig(); // keep a reference to the writer's config
  this.docs = docs;
  this.doRandomCommit = doRandomCommit;
}
/** * populates a writer with random stuff. this must be fully reproducable with the seed! */ public static void createRandomIndex(int numdocs, RandomIndexWriter writer, long seed) throws IOException { Random random = new Random(seed); // primary source for our data is from linefiledocs, its realistic. LineFileDocs lineFileDocs = new LineFileDocs(random); // TODO: we should add other fields that use things like docs&freqs but omit positions, // because linefiledocs doesn't cover all the possibilities. for (int i = 0; i < numdocs; i++) { Document document = lineFileDocs.nextDoc(); // grab the title and add some SortedSet instances for fun String title = document.get("titleTokenized"); String split[] = title.split("\\s+"); for (String trash : split) { document.add(new SortedSetDocValuesField("sortedset", new BytesRef(trash))); } // add a numeric dv field sometimes document.removeFields("sparsenumeric"); if (random.nextInt(4) == 2) { document.add(new NumericDocValuesField("sparsenumeric", random.nextInt())); } // add sortednumeric sometimes document.removeFields("sparsesortednum"); if (random.nextInt(5) == 1) { document.add(new SortedNumericDocValuesField("sparsesortednum", random.nextLong())); if (random.nextBoolean()) { document.add(new SortedNumericDocValuesField("sparsesortednum", random.nextLong())); } } writer.addDocument(document); } lineFileDocs.close(); }
/**
 * Indexes random docs whose per-field float boost is also stored as the field's
 * value, then re-opens the index and asserts that each doc's norm, reinterpreted
 * as float bits, equals the stored boost exactly.
 */
public void testFloatNorms() throws IOException {
  Directory dir = newDirectory();
  IndexWriterConfig config = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
  // MySimProvider controls how the boost is encoded into the norm
  Similarity provider = new MySimProvider();
  config.setSimilarity(provider);
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir, config);
  final LineFileDocs docs = new LineFileDocs(random());
  int num = atLeast(100);
  for (int i = 0; i < num; i++) {
    Document doc = docs.nextDoc();
    float nextFloat = random().nextFloat();
    // store the boost as the field's value so the expectation can be recomputed below
    Field f = new TextField(floatTestField, "" + nextFloat, Field.Store.YES);
    f.setBoost(nextFloat);
    doc.add(f);
    writer.addDocument(doc);
    // strip our field before the next iteration (the doc instance may be reused — confirm)
    doc.removeField(floatTestField);
    if (rarely()) {
      writer.commit();
    }
  }
  writer.commit();
  writer.close();
  AtomicReader open = SlowCompositeReaderWrapper.wrap(DirectoryReader.open(dir));
  NumericDocValues norms = open.getNormValues(floatTestField);
  assertNotNull(norms);
  for (int i = 0; i < open.maxDoc(); i++) {
    Document document = open.document(i);
    float expected = Float.parseFloat(document.get(floatTestField));
    // norm holds the raw float bits of the boost (that is what this asserts)
    assertEquals(expected, Float.intBitsToFloat((int) norms.get(i)), 0.0f);
  }
  open.close();
  dir.close();
  docs.close();
}
/**
 * Appends the UTF-8 encoding of the next line-file document's "body" field
 * to the given output stream.
 */
private void addString(LineFileDocs lineFileDocs, ByteArrayOutputStream bos) throws IOException {
  final String body = lineFileDocs.nextDoc().get("body");
  final byte[] encoded = body.getBytes(StandardCharsets.UTF_8);
  bos.write(encoded);
}
/**
 * Manual smoke test: builds a FreeTextSuggester from the first 10000 bodies of a
 * local Wikipedia line file and prints size and sample lookups. Ignored by
 * default because it depends on a machine-local data path.
 */
@Ignore
public void testWiki() throws Exception {
  final LineFileDocs lfd = new LineFileDocs(null, "/lucenedata/enwiki/enwiki-20120502-lines-1k.txt", false);
  // Skip header:
  lfd.nextDoc();
  FreeTextSuggester sug = new FreeTextSuggester(new MockAnalyzer(random()));
  sug.build(new InputIterator() {
    // number of docs consumed so far; input stops at 10000
    private int count;

    @Override
    public long weight() {
      // every suggestion input weighted equally
      return 1;
    }

    @Override
    public Comparator<BytesRef> getComparator() {
      return null;
    }

    @Override
    public BytesRef next() {
      Document doc;
      try {
        doc = lfd.nextDoc();
      } catch (IOException ioe) {
        throw new RuntimeException(ioe);
      }
      if (doc == null) {
        // line file exhausted
        return null;
      }
      if (count++ == 10000) {
        // cap the input size
        return null;
      }
      return new BytesRef(doc.get("body"));
    }

    @Override
    public BytesRef payload() {
      return null;
    }

    @Override
    public boolean hasPayloads() {
      return false;
    }

    @Override
    public Set<BytesRef> contexts() {
      return null;
    }

    @Override
    public boolean hasContexts() {
      return false;
    }
  });
  if (VERBOSE) {
    System.out.println(sug.ramBytesUsed() + " bytes");
    List<LookupResult> results = sug.lookup("general r", 10);
    System.out.println("results:");
    for (LookupResult result : results) {
      System.out.println(" " + result);
    }
  }
}
/**
 * Opens the shared line-file document source once for the whole suite,
 * seeded from the test randomness.
 */
@BeforeClass
public static void beforeClass() throws Exception {
  lineDocFile = new LineFileDocs(random(), defaultCodecSupportsDocValues());
}
/**
 * Indexes 100..400 docs through an NRTCachingDirectory, occasionally opening /
 * reopening an NRT reader and searching it, then verifies that closing the
 * writer flushes the cache (no cached files remain) and that every indexed
 * docid is findable in the on-disk index.
 */
public void testNRTAndCommit() throws Exception {
  Directory dir = newDirectory();
  // small cache thresholds (2MB per file, 25MB total) so eviction actually happens
  NRTCachingDirectory cachedDir = new NRTCachingDirectory(dir, 2.0, 25.0);
  MockAnalyzer analyzer = new MockAnalyzer(random());
  analyzer.setMaxTokenLength(TestUtil.nextInt(random(), 1, IndexWriter.MAX_TERM_LENGTH));
  IndexWriterConfig conf = newIndexWriterConfig(analyzer);
  RandomIndexWriter w = new RandomIndexWriter(random(), cachedDir, conf);
  final LineFileDocs docs = new LineFileDocs(random(), defaultCodecSupportsDocValues());
  final int numDocs = TestUtil.nextInt(random(), 100, 400);
  if (VERBOSE) {
    System.out.println("TEST: numDocs=" + numDocs);
  }
  final List<BytesRef> ids = new ArrayList<>();
  DirectoryReader r = null;
  for (int docCount = 0; docCount < numDocs; docCount++) {
    final Document doc = docs.nextDoc();
    ids.add(new BytesRef(doc.get("docid")));
    w.addDocument(doc);
    // ~5% of the time, open or refresh an NRT reader and sanity-check it
    if (random().nextInt(20) == 17) {
      if (r == null) {
        r = DirectoryReader.open(w.w, false);
      } else {
        final DirectoryReader r2 = DirectoryReader.openIfChanged(r);
        if (r2 != null) {
          r.close();
          r = r2;
        }
      }
      assertEquals(1 + docCount, r.numDocs());
      final IndexSearcher s = newSearcher(r);
      // Just make sure search can run; we can't assert
      // totHits since it could be 0
      TopDocs hits = s.search(new TermQuery(new Term("body", "the")), 10);
      // System.out.println("tot hits " + hits.totalHits);
    }
  }
  if (r != null) {
    r.close();
  }
  // Close should force cache to clear since all files are sync'd
  w.close();
  final String[] cachedFiles = cachedDir.listCachedFiles();
  for (String file : cachedFiles) {
    System.out.println("FAIL: cached file " + file + " remains after sync");
  }
  assertEquals(0, cachedFiles.length);
  // verify every docid made it to the underlying (non-caching) directory
  r = DirectoryReader.open(dir);
  for (BytesRef id : ids) {
    assertEquals(1, r.docFreq(new Term("docid", id)));
  }
  r.close();
  cachedDir.close();
  docs.close();
}
/**
 * Indexes 100..400 docs through an NRTCachingDirectory, occasionally opening /
 * reopening an NRT reader and searching it, then verifies that closing the
 * writer flushes the cache (no cached files remain) and that every indexed
 * docid is findable in the on-disk index.
 */
public void testNRTAndCommit() throws Exception {
  Directory dir = newDirectory();
  // small cache thresholds (2MB per file, 25MB total) so eviction actually happens
  NRTCachingDirectory cachedDir = new NRTCachingDirectory(dir, 2.0, 25.0);
  IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
  RandomIndexWriter w = new RandomIndexWriter(random(), cachedDir, conf);
  final LineFileDocs docs = new LineFileDocs(random(), defaultCodecSupportsDocValues());
  final int numDocs = _TestUtil.nextInt(random(), 100, 400);
  if (VERBOSE) {
    System.out.println("TEST: numDocs=" + numDocs);
  }
  final List<BytesRef> ids = new ArrayList<BytesRef>();
  DirectoryReader r = null;
  for (int docCount = 0; docCount < numDocs; docCount++) {
    final Document doc = docs.nextDoc();
    ids.add(new BytesRef(doc.get("docid")));
    w.addDocument(doc);
    // ~5% of the time, open or refresh an NRT reader and sanity-check it
    if (random().nextInt(20) == 17) {
      if (r == null) {
        r = DirectoryReader.open(w.w, false);
      } else {
        final DirectoryReader r2 = DirectoryReader.openIfChanged(r);
        if (r2 != null) {
          r.close();
          r = r2;
        }
      }
      assertEquals(1 + docCount, r.numDocs());
      final IndexSearcher s = new IndexSearcher(r);
      // Just make sure search can run; we can't assert
      // totHits since it could be 0
      TopDocs hits = s.search(new TermQuery(new Term("body", "the")), 10);
      // System.out.println("tot hits " + hits.totalHits);
    }
  }
  if (r != null) {
    r.close();
  }
  // Close should force cache to clear since all files are sync'd
  w.close();
  final String[] cachedFiles = cachedDir.listCachedFiles();
  for (String file : cachedFiles) {
    System.out.println("FAIL: cached file " + file + " remains after sync");
  }
  assertEquals(0, cachedFiles.length);
  // verify every docid made it to the underlying (non-caching) directory
  r = DirectoryReader.open(dir);
  for (BytesRef id : ids) {
    assertEquals(1, r.docFreq(new Term("docid", id)));
  }
  r.close();
  cachedDir.close();
  docs.close();
}
/**
 * Manual smoke test: builds a FreeTextSuggester from the first 10000 bodies of a
 * local Wikipedia line file and prints size and sample lookups. Ignored by
 * default because it depends on a machine-local data path.
 */
@Ignore
public void testWiki() throws Exception {
  final LineFileDocs lfd = new LineFileDocs(null, "/lucenedata/enwiki/enwiki-20120502-lines-1k.txt", false);
  // Skip header:
  lfd.nextDoc();
  FreeTextSuggester sug = new FreeTextSuggester(new MockAnalyzer(random()));
  sug.build(new InputIterator() {
    // number of docs consumed so far; input stops at 10000
    private int count;

    @Override
    public long weight() {
      // every suggestion input weighted equally
      return 1;
    }

    @Override
    public Comparator<BytesRef> getComparator() {
      return null;
    }

    @Override
    public BytesRef next() {
      Document doc;
      try {
        doc = lfd.nextDoc();
      } catch (IOException ioe) {
        throw new RuntimeException(ioe);
      }
      if (doc == null) {
        // line file exhausted
        return null;
      }
      if (count++ == 10000) {
        // cap the input size
        return null;
      }
      return new BytesRef(doc.get("body"));
    }

    @Override
    public BytesRef payload() {
      return null;
    }

    @Override
    public boolean hasPayloads() {
      return false;
    }
  });
  if (VERBOSE) {
    System.out.println(sug.sizeInBytes() + " bytes");
    List<LookupResult> results = sug.lookup("general r", 10);
    System.out.println("results:");
    for (LookupResult result : results) {
      System.out.println(" " + result);
    }
  }
}
/**
 * Indexes 100..400 docs through an NRTCachingDirectory, occasionally opening /
 * reopening an NRT reader and searching it, then verifies that closing the
 * writer flushes the cache (no cached files remain) and that every indexed
 * docid is findable in the on-disk index.
 */
public void testNRTAndCommit() throws Exception {
  Directory dir = newDirectory();
  // small cache thresholds (2MB per file, 25MB total) so eviction actually happens
  NRTCachingDirectory cachedDir = new NRTCachingDirectory(dir, 2.0, 25.0);
  IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
  RandomIndexWriter w = new RandomIndexWriter(random(), cachedDir, conf);
  final LineFileDocs docs = new LineFileDocs(random(), defaultCodecSupportsDocValues());
  final int numDocs = _TestUtil.nextInt(random(), 100, 400);
  if (VERBOSE) {
    System.out.println("TEST: numDocs=" + numDocs);
  }
  final List<BytesRef> ids = new ArrayList<BytesRef>();
  DirectoryReader r = null;
  for (int docCount = 0; docCount < numDocs; docCount++) {
    final Document doc = docs.nextDoc();
    ids.add(new BytesRef(doc.get("docid")));
    w.addDocument(doc);
    // ~5% of the time, open or refresh an NRT reader and sanity-check it
    if (random().nextInt(20) == 17) {
      if (r == null) {
        r = DirectoryReader.open(w.w, false);
      } else {
        final DirectoryReader r2 = DirectoryReader.openIfChanged(r);
        if (r2 != null) {
          r.close();
          r = r2;
        }
      }
      assertEquals(1 + docCount, r.numDocs());
      final IndexSearcher s = newSearcher(r);
      // Just make sure search can run; we can't assert
      // totHits since it could be 0
      TopDocs hits = s.search(new TermQuery(new Term("body", "the")), 10);
      // System.out.println("tot hits " + hits.totalHits);
    }
  }
  if (r != null) {
    r.close();
  }
  // Close should force cache to clear since all files are sync'd
  w.close();
  final String[] cachedFiles = cachedDir.listCachedFiles();
  for (String file : cachedFiles) {
    System.out.println("FAIL: cached file " + file + " remains after sync");
  }
  assertEquals(0, cachedFiles.length);
  // verify every docid made it to the underlying (non-caching) directory
  r = DirectoryReader.open(dir);
  for (BytesRef id : ids) {
    assertEquals(1, r.docFreq(new Term("docid", id)));
  }
  r.close();
  cachedDir.close();
  docs.close();
}