/**
 * Checks that {@code readLine} hands back multi-byte characters unchanged
 * and does not split a line at U+200A, a character whose low byte equals
 * the newline value 0x0A.
 *
 * @throws Exception if a stream cannot be created, read or closed
 */
public void testUTF8() throws Exception {
  Text line = new Text();

  // Each reader gets its own try/finally so the first one is closed before
  // the variable is reused for the second stream.
  LineReader in = makeStream("abcd\u20acbdcd\u20ac");
  try {
    in.readLine(line);
    assertEquals("readLine changed utf8 characters",
                 "abcd\u20acbdcd\u20ac", line.toString());
  } finally {
    in.close();
  }

  in = makeStream("abc\u200axyz");
  try {
    in.readLine(line);
    assertEquals("split on fake newline",
                 "abc\u200axyz", line.toString());
  } finally {
    in.close();
  }
}
/**
 * Reads a six-line input that mixes LF, CR and CRLF terminators and checks
 * the length of every returned line, then checks that a further read
 * reports zero bytes.
 *
 * @throws Exception if the stream cannot be created, read or closed
 */
public void testNewLines() throws Exception {
  LineReader in = null;
  try {
    in = makeStream("a\nbb\n\nccc\rdddd\r\neeeee");
    Text out = new Text();
    in.readLine(out);
    assertEquals("line1 length", 1, out.getLength());
    in.readLine(out);
    assertEquals("line2 length", 2, out.getLength());
    in.readLine(out);
    assertEquals("line3 length", 0, out.getLength());
    in.readLine(out);
    assertEquals("line4 length", 3, out.getLength());
    in.readLine(out);
    assertEquals("line5 length", 4, out.getLength());
    in.readLine(out);
    // Label fixed: this is the sixth line, not a second "line5".
    assertEquals("line6 length", 5, out.getLength());
    assertEquals("end of file", 0, in.readLine(out));
  } finally {
    if (in != null) {
      in.close();
    }
  }
}
/**
 * Test readLine for correct interpretation of maxLineLength
 * (returned string should be clipped at maxLineLength, and the
 * remaining bytes on the same line should be thrown out).
 * Also check that the summed return values match the input's byte length.
 * Varies buffer size to stress test.
 *
 * @throws Exception if a stream cannot be created, read or closed
 */
@Test (timeout=5000)
public void testMaxLineLength() throws Exception {
  final String STR = "a\nbb\n\nccc\rdddd\r\neeeee";
  // Explicit charset so the byte count never depends on the platform default.
  final int STRLENBYTES = STR.getBytes("UTF-8").length;
  Text out = new Text();
  for (int bufsz = 1; bufsz < STRLENBYTES+1; ++bufsz) {
    LineReader in = makeStream(STR, bufsz);
    try {
      int c = 0;
      c += in.readLine(out, 1);
      assertEquals("line1 length, bufsz: "+bufsz, 1, out.getLength());
      c += in.readLine(out, 1);
      assertEquals("line2 length, bufsz: "+bufsz, 1, out.getLength());
      c += in.readLine(out, 1);
      assertEquals("line3 length, bufsz: "+bufsz, 0, out.getLength());
      c += in.readLine(out, 3);
      assertEquals("line4 length, bufsz: "+bufsz, 3, out.getLength());
      c += in.readLine(out, 10);
      assertEquals("line5 length, bufsz: "+bufsz, 4, out.getLength());
      c += in.readLine(out, 8);
      assertEquals("line6 length, bufsz: "+bufsz, 5, out.getLength());
      assertEquals("end of file, bufsz: " +bufsz, 0, in.readLine(out));
      // Expected value first, so a failure message reports the right side.
      assertEquals("total bytes, bufsz: "+bufsz, STRLENBYTES, c);
    } finally {
      // One reader is created per buffer size; release each of them.
      in.close();
    }
  }
}
/**
 * Reads a six-line input that mixes LF, CR and CRLF terminators and checks
 * the length of every returned line, then checks that a further read
 * reports zero bytes.
 *
 * @throws Exception if the stream cannot be created, read or closed
 */
@Test
public void testNewLines() throws Exception {
  LineReader in = makeStream("a\nbb\n\nccc\rdddd\r\neeeee");
  try {
    Text out = new Text();
    in.readLine(out);
    assertEquals("line1 length", 1, out.getLength());
    in.readLine(out);
    assertEquals("line2 length", 2, out.getLength());
    in.readLine(out);
    assertEquals("line3 length", 0, out.getLength());
    in.readLine(out);
    assertEquals("line4 length", 3, out.getLength());
    in.readLine(out);
    assertEquals("line5 length", 4, out.getLength());
    in.readLine(out);
    // Label fixed: this is the sixth line, not a second "line5".
    assertEquals("line6 length", 5, out.getLength());
    assertEquals("end of file", 0, in.readLine(out));
  } finally {
    // Close the reader even when an assertion fails.
    in.close();
  }
}
/**
 * Opens the file at position {@code index} of a combined split and places a
 * line reader at the first line that belongs to that chunk.
 *
 * <p>When the chunk does not start at offset 0 the stream is positioned one
 * byte before the chunk and one line is read and discarded; the number of
 * bytes consumed is added back to {@code startOffset}. If anything fails
 * after the file has been opened, the stream is closed before the exception
 * propagates.
 *
 * @param split   combined split describing several file chunks
 * @param context task context supplying the configuration
 * @param index   which chunk of {@code split} this reader handles
 * @throws IOException if the file cannot be opened, positioned or read
 */
public CombineFileLineRecordReader(CombineFileSplit split,
    TaskAttemptContext context, Integer index) throws IOException {
  this.path = split.getPath(index);
  fs = this.path.getFileSystem(context.getConfiguration());
  this.startOffset = split.getOffset(index);
  this.end = startOffset + split.getLength(index);
  boolean skipFirstLine = false;

  // open the file
  fileIn = fs.open(path);
  boolean initialized = false;
  try {
    if (startOffset != 0) {
      skipFirstLine = true;
      // Step back one byte before discarding a line below.
      --startOffset;
      fileIn.seek(startOffset);
    }
    reader = new LineReader(fileIn);
    if (skipFirstLine) {
      // skip first line and re-establish "startOffset".
      startOffset += reader.readLine(new Text(), 0,
          (int)Math.min((long)Integer.MAX_VALUE, end - startOffset));
    }
    this.pos = startOffset;
    initialized = true;
  } finally {
    if (!initialized) {
      // Construction failed: nobody else will ever close this stream.
      try {
        fileIn.close();
      } catch (IOException ignored) {
        // Keep the original failure as the exception that propagates.
      }
    }
  }
}
/**
 * Opens the file behind {@code split} and prepares a line reader over it.
 *
 * <p>For a file without a registered compression codec the raw stream is
 * handed to {@code positionAtFirstRecord} and then read directly. For a
 * compressed file the split must start at offset 0; the stream is wrapped by
 * the codec and {@code end} is set to {@code Long.MAX_VALUE} so reading
 * continues to the end of the file. If setup fails after the file has been
 * opened, the stream is closed before the exception propagates.
 *
 * @param conf  configuration used to resolve the file system and codecs
 * @param split the file split to read
 * @throws IOException      if the file cannot be opened or positioned
 * @throws RuntimeException if a compressed file is given a non-zero start
 */
public SingleFastqRecordReader(Configuration conf, FileSplit split) throws IOException {
  file = split.getPath();
  start = split.getStart();
  end = start + split.getLength();

  FileSystem fs = file.getFileSystem(conf);
  FSDataInputStream fileIn = fs.open(file);
  boolean initialized = false;
  try {
    CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
    CompressionCodec codec = codecFactory.getCodec(file);

    if (codec == null) { // no codec.  Uncompressed file.
      positionAtFirstRecord(fileIn);
      inputStream = fileIn;
    } else { // compressed file
      if (start != 0) {
        throw new RuntimeException("Start position for compressed file is not 0! (found " + start + ")");
      }
      inputStream = codec.createInputStream(fileIn);
      end = Long.MAX_VALUE; // read until the end of the file
    }

    lineReader = new LineReader(inputStream);
    initialized = true;
  } finally {
    if (!initialized) {
      // Construction failed: nobody else will ever close this stream.
      try {
        fileIn.close();
      } catch (IOException ignored) {
        // Keep the original failure as the exception that propagates.
      }
    }
  }
}
/**
 * Builds the input splits for a job by reading every input file line by line.
 *
 * <p>Each non-empty read produces one {@link FileSplit} whose start is the
 * byte offset of the line in the file, whose length is the number of bytes
 * {@code readLine} consumed, and whose hosts come from
 * {@code getStoreDirHosts}. Reading of a file stops at the first read that
 * consumes no bytes.
 *
 * @param job the job whose input files are listed
 * @return one split per line read from the input files
 * @throws IOException if an input file cannot be listed, opened or read
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
  List<InputSplit> result = new ArrayList<InputSplit>();
  List<FileStatus> inputs = listStatus(job);
  Text lineBuffer = new Text();

  for (FileStatus status : inputs) {
    Path inputPath = status.getPath();
    FileSystem fileSystem = inputPath.getFileSystem(job.getConfiguration());
    LineReader lines = new LineReader(fileSystem.open(inputPath));
    try {
      long offset = 0;
      for (int consumed = lines.readLine(lineBuffer);
           consumed > 0;
           consumed = lines.readLine(lineBuffer)) {
        String[] locations = getStoreDirHosts(fileSystem, inputPath);
        result.add(new FileSplit(inputPath, offset, consumed, locations));
        offset += consumed;
      }
    } finally {
      lines.close();
    }
  }

  return result;
}
/**
 * Registers a chromosome list file, and every file it names, as cache files
 * of a job.
 *
 * <p>The list itself is added under the fragment {@code cacheName}. Each
 * non-empty line of the list is split on tabs; field 1 is the location to
 * add and field 0 the fragment to add it under. A location that is an
 * existing local file gets a {@code file://} prefix. Empty lines are skipped.
 *
 * @param chrList   path of the chromosome list file
 * @param job       job that receives the cache files
 * @param cacheName fragment under which the list itself is registered
 * @return always {@code true}
 * @throws IOException        if the list cannot be read or a non-empty line
 *                            has fewer than two tab-separated fields
 * @throws URISyntaxException if a location does not form a valid URI
 */
public static boolean distributeCache(String chrList, Job job, String cacheName)
    throws IOException, URISyntaxException {
  job.addCacheFile(new URI(chrList + "#" + cacheName));

  Configuration conf = job.getConfiguration();
  Path refPath = new Path(chrList);
  FileSystem fs = refPath.getFileSystem(conf);
  FSDataInputStream refin = fs.open(refPath);
  LineReader in = new LineReader(refin);
  try {
    Text line = new Text();
    while (in.readLine(line) != 0) {
      if (line.getLength() == 0) {
        // Blank line: nothing to register.
        continue;
      }
      String[] chrs = line.toString().split("\t");
      if (chrs.length < 2) {
        throw new IOException("Malformed line in " + chrList
            + " (expected at least 2 tab-separated fields): " + line);
      }
      String location = chrs[1];
      if (new File(location).isFile()) {
        location = "file://" + location;
      }
      job.addCacheFile(new URI(location + "#" + chrs[0]));
    }
  } finally {
    // Release the reader on every path, including failures in the loop.
    in.close();
    refin.close();
  }
  return true;
}
protected void loadChromosomeList(Path refPath) throws NumberFormatException, IOException{ Configuration conf = new Configuration(); FileSystem fs = refPath.getFileSystem(conf); FSDataInputStream refin = fs.open(refPath); LineReader in = new LineReader(refin); Text line = new Text(); String chrFile = ""; String[] chrs = new String[3]; while((in.readLine(line)) != 0){ chrFile = line.toString(); chrs = chrFile.split("\t"); // insert chr if(!addChromosome(chrs[0])) { in.close(); throw new RuntimeException("map Chromosome "+chrs[1]+" Failed."); } setChromosome(chrs[1],chrs[0],Integer.parseInt(chrs[2])); } in.close(); }
/**
 * Reads a quality-control report file and feeds its contents into
 * {@code report}.
 *
 * <p>Every line read by the outer loop is passed to {@code addCount}, which
 * yields a sample id. Unless {@code report.isPartitionNull()} is true, up to
 * {@code BASE_STATIC_COUNT} further lines are read and passed to
 * {@code addBaseByPosition} with their index; a read that returns 0 bytes is
 * not passed on. The reader is closed on every exit path.
 *
 * @param path   file to read
 * @param conf   configuration used for the file system and the reader
 * @param report report object that accumulates the parsed values
 * @throws IOException if the file cannot be opened or read
 */
public void readFromHdfs(Path path, Configuration conf,
    FastqQualityControlReport report) throws IOException {
  FileSystem fs = path.getFileSystem(conf);
  FSDataInputStream FSinput = fs.open(path);
  LineReader lineReader = new LineReader(FSinput, conf);
  try {
    Text line = new Text();
    while (lineReader.readLine(line) != 0) {
      int sampleID = report.addCount(line.toString());
      if (report.isPartitionNull()) {
        continue;
      }

      for (int i = 0; i < FastqQualityControlReport.BASE_STATIC_COUNT; i++) {
        int cnt = lineReader.readLine(line);
        if (cnt == 0) {
          continue;
        }
        report.addBaseByPosition(sampleID, i, line.toString());
      }
    }
  } finally {
    // Release the reader even when the report rejects a line.
    lineReader.close();
  }
}
/**
 * Parses the coverage section of a report on top of the parent's parsing.
 *
 * <p>After delegating to the superclass, the current line is checked for the
 * marker "Cover Information". If it is present, the next line is read and
 * split on tabs. A token without a space is treated as a chromosome name and
 * selects (creating and registering it if needed) the cover report for that
 * chromosome; any other token is handed to the currently selected report's
 * {@code parse} method.
 *
 * @param lineReader source of report lines
 * @param line       buffer holding the current line; overwritten on read
 * @param genome     reference data used to look up chromosome information
 * @throws IOException if reading from {@code lineReader} fails
 */
@Override
public void parseReport(LineReader lineReader, Text line, ReferenceShare genome) throws IOException {
  super.parseReport(lineReader, line, genome);

  if (!line.toString().contains("Cover Information")) {
    return;
  }
  if (lineReader.readLine(line) <= 0 || line.getLength() == 0) {
    return;
  }

  WholeGenomeCoverReport current = null;
  for (String token : line.toString().split("\t")) {
    if (token.split(" ").length != 1) {
      // Key/value token: belongs to the most recently named chromosome.
      assert current != null;
      current.parse(token, genome);
      continue;
    }

    // Single-word token: a chromosome name.
    if (coverReports.containsKey(token)) {
      current = coverReports.get(token);
    } else {
      ChromosomeInformationShare info = genome.getChromosomeInfo(token);
      current = new WholeGenomeCoverReport(info);
      coverReports.put(token, current);
    }
  }
}
/**
 * Creates a mappings file from the contents of a flat text file containing docid to docno
 * mappings. This method is used by {@link WikipediaDocnoMappingBuilder} internally.
 *
 * <p>The output starts with {@code n} as an int, followed by one int per input line: the
 * integer value of the line's first tab-separated field. Both files are closed on every
 * exit path.
 *
 * @param fs file system holding both files
 * @param inputFile flat text file containing docid to docno mappings
 * @param n number of lines to read from {@code inputFile}
 * @param outputFile output mappings file (overwritten if it exists)
 * @throws IOException if a file cannot be read or written, or {@code inputFile} ends before
 *         {@code n} lines have been read
 */
static public void writeDocnoMappingData(FileSystem fs, String inputFile, int n,
    String outputFile) throws IOException {
  LOG.info("Writing " + n + " docids to " + outputFile);
  LineReader reader = new LineReader(fs.open(new Path(inputFile)));
  try {
    FSDataOutputStream out = fs.create(new Path(outputFile), true);
    try {
      Text line = new Text();
      out.writeInt(n);
      for (int i = 0; i < n; i++) {
        // readLine returns 0 only when no more input is available.
        if (reader.readLine(line) == 0) {
          throw new IOException("Expected " + n + " lines in " + inputFile
              + " but reached end of file after " + i);
        }
        String[] arr = line.toString().split("\\t");
        out.writeInt(Integer.parseInt(arr[0]));
        int cnt = i + 1;
        if (cnt % 100000 == 0) {
          LOG.info(cnt + " articles");
        }
      }
    } finally {
      out.close();
    }
  } finally {
    reader.close();
  }
  LOG.info("Done!");
}
/**
 * Test readLine for correct interpretation of maxLineLength
 * (returned string should be clipped at maxLineLength, and the
 * remaining bytes on the same line should be thrown out).
 * Also check that the summed return values match the input's byte length.
 * Varies buffer size to stress test.
 *
 * @throws Exception if a stream cannot be created, read or closed
 */
@Test
public void testMaxLineLength() throws Exception {
  final String STR = "a\nbb\n\nccc\rdddd\r\neeeee";
  // Explicit charset so the byte count never depends on the platform default.
  final int STRLENBYTES = STR.getBytes("UTF-8").length;
  Text out = new Text();
  for (int bufsz = 1; bufsz < STRLENBYTES+1; ++bufsz) {
    LineReader in = makeStream(STR, bufsz);
    try {
      int c = 0;
      c += in.readLine(out, 1);
      assertEquals("line1 length, bufsz: "+bufsz, 1, out.getLength());
      c += in.readLine(out, 1);
      assertEquals("line2 length, bufsz: "+bufsz, 1, out.getLength());
      c += in.readLine(out, 1);
      assertEquals("line3 length, bufsz: "+bufsz, 0, out.getLength());
      c += in.readLine(out, 3);
      assertEquals("line4 length, bufsz: "+bufsz, 3, out.getLength());
      c += in.readLine(out, 10);
      assertEquals("line5 length, bufsz: "+bufsz, 4, out.getLength());
      c += in.readLine(out, 8);
      assertEquals("line6 length, bufsz: "+bufsz, 5, out.getLength());
      assertEquals("end of file, bufsz: " +bufsz, 0, in.readLine(out));
      // Expected value first, so a failure message reports the right side.
      assertEquals("total bytes, bufsz: "+bufsz, STRLENBYTES, c);
    } finally {
      // One reader is created per buffer size; release each of them.
      in.close();
    }
  }
}
/**
 * Reads a six-line input that mixes LF, CR and CRLF terminators and checks
 * the length of every returned line, then checks that a further read
 * reports zero bytes.
 *
 * @throws Exception if the stream cannot be created, read or closed
 */
public void testNewLines() throws Exception {
  LineReader in = makeStream("a\nbb\n\nccc\rdddd\r\neeeee");
  try {
    Text out = new Text();
    in.readLine(out);
    assertEquals("line1 length", 1, out.getLength());
    in.readLine(out);
    assertEquals("line2 length", 2, out.getLength());
    in.readLine(out);
    assertEquals("line3 length", 0, out.getLength());
    in.readLine(out);
    assertEquals("line4 length", 3, out.getLength());
    in.readLine(out);
    assertEquals("line5 length", 4, out.getLength());
    in.readLine(out);
    // Label fixed: this is the sixth line, not a second "line5".
    assertEquals("line6 length", 5, out.getLength());
    assertEquals("end of file", 0, in.readLine(out));
  } finally {
    // Close the reader even when an assertion fails.
    in.close();
  }
}
/**
 * Constructor that reads the contents of the index file.
 *
 * <p>Lines are read and passed to {@code parseLine} until {@code max} bytes
 * have been consumed or the stream runs out of data, whichever comes first.
 * The stream is not closed here.
 *
 * @param in An input stream to the index file.
 * @param max The size of the index file.
 * @throws IOException if reading fails or a line uses an unsupported encoding
 */
public HarIndex(InputStream in, long max) throws IOException {
  LineReader lineReader = new LineReader(in);
  Text text = new Text();
  long nread = 0;
  while (nread < max) {
    int n = lineReader.readLine(text);
    if (n <= 0) {
      // End of stream before max bytes: without this exit nread would
      // never advance and the loop would spin forever.
      break;
    }
    nread += n;
    String line = text.toString();
    try {
      parseLine(line);
    } catch (UnsupportedEncodingException e) {
      // Keep the cause so the offending encoding is visible to the caller.
      throw new IOException("UnsupportedEncodingException after reading " +
                            nread + " bytes", e);
    }
  }
}
public int getHarVersion() throws IOException { FSDataInputStream masterIn = fs.open(masterIndex); LineReader lmaster = new LineReader(masterIn, getConf()); Text line = new Text(); lmaster.readLine(line); try { masterIn.close(); } catch(IOException e){ //disregard it. // its a read. } String versionLine = line.toString(); String[] arr = versionLine.split(" "); int version = Integer.parseInt(arr[0]); return version; }
/**
 * Checks that {@code readLine} hands back multi-byte characters unchanged
 * and does not split a line at U+200A, a character whose low byte equals
 * the newline value 0x0A.
 *
 * @throws Exception if a stream cannot be created, read or closed
 */
@Test
public void testUTF8() throws Exception {
  Text line = new Text();

  // Each reader gets its own try/finally so the first one is closed before
  // the variable is reused for the second stream.
  LineReader in = makeStream("abcd\u20acbdcd\u20ac");
  try {
    in.readLine(line);
    assertEquals("readLine changed utf8 characters",
                 "abcd\u20acbdcd\u20ac", line.toString());
  } finally {
    in.close();
  }

  in = makeStream("abc\u200axyz");
  try {
    in.readLine(line);
    assertEquals("split on fake newline",
                 "abc\u200axyz", line.toString());
  } finally {
    in.close();
  }
}