/**
 * Returns an iterator over the buffered sequences, read back from the sorted file.
 *
 * <p>On the first call this invokes {@code closeWriter()}, sorts the raw input file into a
 * new temporary file, then deletes the raw input file and clears its reference. Subsequent
 * calls skip that step and open a new reader over the already sorted file.
 *
 * @return an iterator over the sorted file, using the comparator of the configured sort
 * @throws IOException if the temporary file cannot be created, the sort fails, or the
 *         sorted file cannot be opened
 */
@Override
public BytesRefIterator iterator() throws IOException {
  if (sorted == null) {
    closeWriter();

    // Sort into a local file first and publish it in the field only on success.
    // Otherwise a failed sort would leave "sorted" pointing at a partial file and
    // every later call would silently iterate over incomplete data.
    // NOTE(review): a retry after a failed sort invokes closeWriter() again --
    // confirm it tolerates repeated calls.
    File sortedFile = File.createTempFile("RefSorter-", ".sorted", Sort.defaultTempDir());
    boolean success = false;
    try {
      sort.sort(input, sortedFile);
      success = true;
    } finally {
      if (!success) {
        // Best effort: do not leave a half-written temporary file behind.
        sortedFile.delete();
      }
    }
    sorted = sortedFile;

    // The raw input is no longer needed once the sorted copy exists.
    input.delete();
    input = null;
  }

  return new ByteSequenceIterator(new Sort.ByteSequencesReader(sorted), sort.getComparator());
}
/** * Will buffer all sequences to a temporary file and then sort (all on-disk). */ public ExternalRefSorter(Sort sort) throws IOException { this.sort = sort; this.input = File.createTempFile("RefSorter-", ".raw", Sort.defaultTempDir()); this.writer = new Sort.ByteSequencesWriter(input); }
/**
 * Builds an {@link FSTCompletion} from a dictionary file (one entry per line, decoded as
 * UTF-8) and saves the resulting automaton to {@code completion.fst}.
 *
 * <p>Each line is added with the bucket {@code lineIndex % buckets}. Progress is reported
 * on standard error every 100000 lines.
 *
 * @param args optional; when present, {@code args[0]} is the path of the input dictionary,
 *        otherwise the built-in default path is used
 * @throws IOException if reading the input, building, or saving the automaton fails
 */
public static void main(String[] args) throws IOException {
  final String defaultInput = "/home/dweiss/tmp/shuffled.dict";
  File input = new File(args != null && args.length > 0 ? args[0] : defaultInput);

  int buckets = 20;
  int shareMaxTail = 10;

  ExternalRefSorter sorter = new ExternalRefSorter(new Sort());
  try {
    FSTCompletionBuilder builder = new FSTCompletionBuilder(buckets, sorter, shareMaxTail);

    BufferedReader reader = new BufferedReader(
        new InputStreamReader(
            new FileInputStream(input), "UTF-8"));
    try {
      BytesRef scratch = new BytesRef();
      String line;
      int count = 0;
      while ((line = reader.readLine()) != null) {
        scratch.copyChars(line);
        builder.add(scratch, count % buckets);
        if ((count++ % 100000) == 0) {
          System.err.println("Line: " + count);
        }
      }
    } finally {
      // Release the input file handle even if reading or adding an entry fails.
      reader.close();
    }

    System.out.println("Building FSTCompletion.");
    FSTCompletion completion = builder.build();

    File fstFile = new File("completion.fst");
    System.out.println("Done. Writing automaton: " + fstFile.getAbsolutePath());
    completion.getFST().save(fstFile);
  } finally {
    // Always close the sorter, on the failure path as well as on success.
    sorter.close();
  }
}
@Test public void testIntermediateMerges() throws Exception { // Sort 20 mb worth of data with 1mb buffer, binary merging. SortInfo info = checkSort(new Sort(Sort.DEFAULT_COMPARATOR, BufferSize.megabytes(1), Sort.defaultTempDir(), 2), generateRandom((int)Sort.MB * 20)); assertTrue(info.mergeRounds > 10); }
@Test public void testSmallRandom() throws Exception { // Sort 20 mb worth of data with 1mb buffer. SortInfo sortInfo = checkSort(new Sort(Sort.DEFAULT_COMPARATOR, BufferSize.megabytes(1), Sort.defaultTempDir(), Sort.MAX_TEMPFILES), generateRandom((int)Sort.MB * 20)); assertEquals(1, sortInfo.mergeRounds); }
/** * Check sorting data on an instance of {@link Sort}. */ private SortInfo checkSort(Sort sort, byte[][] data) throws IOException { File unsorted = writeAll("unsorted", data); Arrays.sort(data, unsignedByteOrderComparator); File golden = writeAll("golden", data); File sorted = new File(tempDir, "sorted"); SortInfo sortInfo = sort.sort(unsorted, sorted); //System.out.println("Input size [MB]: " + unsorted.length() / (1024 * 1024)); //System.out.println(sortInfo); assertFilesIdentical(golden, sorted); return sortInfo; }
/**
 * Writes all sequences, in array order, to a file called {@code name} inside
 * {@code tempDir}.
 *
 * @param name the file name, resolved against {@code tempDir}
 * @param data the sequences to write
 * @return the file that was written
 * @throws IOException if opening, writing, or closing the file fails
 */
private File writeAll(String name, byte[][] data) throws IOException {
  File file = new File(tempDir, name);
  ByteSequencesWriter w = new Sort.ByteSequencesWriter(file);
  try {
    for (byte[] datum : data) {
      w.write(datum);
    }
  } finally {
    // Close the writer even when a write fails, so the file handle is not leaked.
    w.close();
  }
  return file;
}
/**
 * Runs the shared {@code check} routine against an {@link ExternalRefSorter} backed by
 * a default {@link Sort}.
 */
@Test
public void testExternalRefSorter() throws Exception {
  ExternalRefSorter s = new ExternalRefSorter(new Sort());
  try {
    check(s);
  } finally {
    // Close the sorter even when the check fails, so it is not left open.
    s.close();
  }
}
/** Checks that sorting an input with no sequences at all works with a default {@link Sort}. */
@Test
public void testEmpty() throws Exception {
  final Sort defaultSort = new Sort();
  final byte[][] noSequences = new byte[0][];
  checkSort(defaultSort, noSequences);
}
/** Checks that an input consisting of exactly one sequence is sorted correctly. */
@Test
public void testSingleLine() throws Exception {
  final Sort defaultSort = new Sort();
  final byte[] onlyLine = "Single line only.".getBytes("UTF-8");
  checkSort(defaultSort, new byte[][] { onlyLine });
}
@Test @Nightly public void testLargerRandom() throws Exception { // Sort 100MB worth of data with 15mb buffer. checkSort(new Sort(Sort.DEFAULT_COMPARATOR, BufferSize.megabytes(16), Sort.defaultTempDir(), Sort.MAX_TEMPFILES), generateRandom((int)Sort.MB * 100)); }