/**
 * Returns an iterator over the buffered sequences, read back from the sorted temporary file.
 * <p>
 * On the first successful call the writer is closed, the raw input file is sorted into a
 * newly created temporary file and the raw input file is deleted. Subsequent calls reuse the
 * already sorted file.
 *
 * @return a new iterator reading from the sorted temporary file
 * @throws IOException if closing the writer, creating the temporary file, sorting, or
 *         opening the sorted file fails
 */
@Override
public BytesRefIterator iterator() throws IOException {
  if (sorted == null) {
    closeWriter();

    File target = File.createTempFile("RefSorter-", ".sorted", OfflineSorter.defaultTempDir());
    boolean success = false;
    try {
      sort.sort(input, target);
      success = true;
    } finally {
      if (!success) {
        // Best-effort removal of the partial output so a failed sort leaves nothing behind.
        target.delete();
      }
    }

    // Publish the sorted file only after the sort succeeded; otherwise a later call would
    // skip sorting and silently iterate over an empty or partial file.
    sorted = target;
    input.delete();
    input = null;
  }

  return new ByteSequenceIterator(
      new OfflineSorter.ByteSequencesReader(sorted), sort.getComparator());
}
/** * Will buffer all sequences to a temporary file and then sort (all on-disk). */ public ExternalRefSorter(OfflineSorter sort) throws IOException { this.sort = sort; this.input = File.createTempFile("RefSorter-", ".raw", OfflineSorter.defaultTempDir()); this.writer = new OfflineSorter.ByteSequencesWriter(input); }
/**
 * Encodes every entry of {@code source} (term and weight) into a temporary input file,
 * sorts that file with an {@link OfflineSorter} using {@code tieBreakByCostComparator} and
 * opens a reader over the sorted output.
 * <p>
 * If anything fails, the writer is closed while suppressing its exceptions and
 * {@link #close()} is invoked before the original exception propagates.
 *
 * @return a reader positioned over the sorted temporary file
 * @throws IOException if creating, writing, sorting or opening the temporary files fails
 */
private ByteSequencesReader sort() throws IOException {
  final String namePrefix = getClass().getSimpleName();
  final File tempDirectory = OfflineSorter.defaultTempDir();
  tempInput = File.createTempFile(namePrefix, ".input", tempDirectory);
  tempSorted = File.createTempFile(namePrefix, ".sorted", tempDirectory);

  final ByteSequencesWriter writer = new ByteSequencesWriter(tempInput);
  boolean success = false;
  try {
    byte[] buffer = new byte[0];
    ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);

    for (BytesRef term = source.next(); term != null; term = source.next()) {
      encode(writer, output, buffer, term, source.weight());
    }
    writer.close();

    OfflineSorter sorter = new OfflineSorter(tieBreakByCostComparator);
    sorter.sort(tempInput, tempSorted);

    ByteSequencesReader sortedReader = new ByteSequencesReader(tempSorted);
    success = true;
    return sortedReader;
  } finally {
    if (!success) {
      try {
        IOUtils.closeWhileHandlingException(writer);
      } finally {
        close();
      }
    } else {
      IOUtils.close(writer);
    }
  }
}
/**
 * Encodes every entry of {@code source} (term, payload, contexts and weight) into a
 * temporary input file, sorts that file with an {@link OfflineSorter} using
 * {@code tieBreakByCostComparator} and opens a reader over the sorted output.
 * <p>
 * If anything fails, the writer is closed while suppressing its exceptions and
 * {@link #close()} is invoked before the original exception propagates.
 *
 * @return a reader positioned over the sorted temporary file
 * @throws IOException if creating, writing, sorting or opening the temporary files fails
 */
private ByteSequencesReader sort() throws IOException {
  final String namePrefix = getClass().getSimpleName();
  final File tempDirectory = OfflineSorter.defaultTempDir();
  tempInput = File.createTempFile(namePrefix, ".input", tempDirectory);
  tempSorted = File.createTempFile(namePrefix, ".sorted", tempDirectory);

  final OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(tempInput);
  boolean success = false;
  try {
    byte[] buffer = new byte[0];
    ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);

    for (BytesRef term = source.next(); term != null; term = source.next()) {
      encode(writer, output, buffer, term, source.payload(), source.contexts(), source.weight());
    }
    writer.close();

    OfflineSorter sorter = new OfflineSorter(tieBreakByCostComparator);
    sorter.sort(tempInput, tempSorted);

    ByteSequencesReader sortedReader = new OfflineSorter.ByteSequencesReader(tempSorted);
    success = true;
    return sortedReader;
  } finally {
    if (!success) {
      try {
        IOUtils.closeWhileHandlingException(writer);
      } finally {
        close();
      }
    } else {
      IOUtils.close(writer);
    }
  }
}
/**
 * Reads a dictionary file line by line (decoded as UTF-8), adds every line to an
 * {@link FSTCompletionBuilder} with bucket {@code count % buckets}, builds the
 * {@link FSTCompletion} and saves its automaton to {@code completion.fst}.
 * <p>
 * The input reader and the external sorter are released even when reading, building or
 * saving fails.
 *
 * @param args ignored
 * @throws IOException if reading the input, sorting, or writing the automaton fails
 */
public static void main(String[] args) throws IOException {
  File input = new File("/home/dweiss/tmp/shuffled.dict");

  int buckets = 20;
  int shareMaxTail = 10;

  ExternalRefSorter sorter = new ExternalRefSorter(new OfflineSorter());
  try {
    FSTCompletionBuilder builder = new FSTCompletionBuilder(buckets, sorter, shareMaxTail);

    BytesRefBuilder scratch = new BytesRefBuilder();
    int count = 0;

    // try-with-resources guarantees the underlying file stream is closed.
    try (BufferedReader reader = new BufferedReader(
        new InputStreamReader(new FileInputStream(input), StandardCharsets.UTF_8))) {
      String line;
      while ((line = reader.readLine()) != null) {
        scratch.copyChars(line);
        builder.add(scratch.get(), count % buckets);
        // Progress report every 100000 lines (count is printed after the increment).
        if ((count++ % 100000) == 0) {
          System.err.println("Line: " + count);
        }
      }
    }

    System.out.println("Building FSTCompletion.");
    FSTCompletion completion = builder.build();

    File fstFile = new File("completion.fst");
    System.out.println("Done. Writing automaton: " + fstFile.getAbsolutePath());
    completion.getFST().save(fstFile);
  } finally {
    // Always release the sorter, including on failure paths.
    sorter.close();
  }
}
/** * Check sorting data on an instance of {@link OfflineSorter}. */ private SortInfo checkSort(OfflineSorter sort, byte[][] data) throws IOException { File unsorted = writeAll("unsorted", data); Arrays.sort(data, unsignedByteOrderComparator); File golden = writeAll("golden", data); File sorted = new File(tempDir, "sorted"); SortInfo sortInfo = sort.sort(unsorted, sorted); //System.out.println("Input size [MB]: " + unsorted.length() / (1024 * 1024)); //System.out.println(sortInfo); assertFilesIdentical(golden, sorted); return sortInfo; }
/**
 * Writes every byte sequence in {@code data} to a file called {@code name} inside
 * {@code tempDir}.
 *
 * @param name the file name, resolved against {@code tempDir}
 * @param data the sequences to write, in order
 * @return the file that was written
 * @throws IOException if opening, writing or closing the file fails
 */
private File writeAll(String name, byte[][] data) throws IOException {
  File file = new File(tempDir, name);
  // try-with-resources closes the writer even when a write fails part-way through.
  try (ByteSequencesWriter w = new OfflineSorter.ByteSequencesWriter(file)) {
    for (byte[] datum : data) {
      w.write(datum);
    }
  }
  return file;
}
/**
 * Runs the shared {@code check} routine against an {@link ExternalRefSorter} backed by a
 * default {@link OfflineSorter}.
 */
@Test
public void testExternalRefSorter() throws Exception {
  ExternalRefSorter s = new ExternalRefSorter(new OfflineSorter());
  try {
    check(s);
  } finally {
    // Close the sorter even when the check fails.
    s.close();
  }
}
/** Sorting an input with no sequences at all must match the (empty) expected output. */
public void testEmpty() throws Exception {
  final OfflineSorter sorter = new OfflineSorter();
  final byte[][] noData = new byte[0][];
  checkSort(sorter, noData);
}
/** Sorting an input that consists of exactly one sequence. */
public void testSingleLine() throws Exception {
  final OfflineSorter sorter = new OfflineSorter();
  final byte[] onlyLine = "Single line only.".getBytes(StandardCharsets.UTF_8);
  final byte[][] data = { onlyLine };
  checkSort(sorter, data);
}
public void testIntermediateMerges() throws Exception { // Sort 20 mb worth of data with 1mb buffer, binary merging. SortInfo info = checkSort(new OfflineSorter(OfflineSorter.DEFAULT_COMPARATOR, BufferSize.megabytes(1), OfflineSorter.defaultTempDir(), 2), generateRandom((int)OfflineSorter.MB * 20)); assertTrue(info.mergeRounds > 10); }
public void testSmallRandom() throws Exception { // Sort 20 mb worth of data with 1mb buffer. SortInfo sortInfo = checkSort(new OfflineSorter(OfflineSorter.DEFAULT_COMPARATOR, BufferSize.megabytes(1), OfflineSorter.defaultTempDir(), OfflineSorter.MAX_TEMPFILES), generateRandom((int)OfflineSorter.MB * 20)); assertEquals(1, sortInfo.mergeRounds); }
@Nightly public void testLargerRandom() throws Exception { // Sort 100MB worth of data with 15mb buffer. checkSort(new OfflineSorter(OfflineSorter.DEFAULT_COMPARATOR, BufferSize.megabytes(16), OfflineSorter.defaultTempDir(), OfflineSorter.MAX_TEMPFILES), generateRandom((int)OfflineSorter.MB * 100)); }