/**
 * Reads the keys of every record stored in the first partition
 * ({@code part-00000/data}) of the current CrawlDb under {@code crawldbPath}.
 *
 * @return the string form of each record key, in the order the reader
 *         returned them
 * @throws IOException
 *           if the sequence file cannot be opened or read
 */
private List<String> readCrawldb() throws IOException {
  Path dbfile = new Path(crawldbPath, CrawlDb.CURRENT_NAME + "/part-00000/data");
  System.out.println("reading:" + dbfile);
  Option rFile = SequenceFile.Reader.file(dbfile);
  List<String> read = new ArrayList<String>();
  // try-with-resources guarantees the reader (and its underlying stream) is
  // released even when next() throws; previously it was never closed.
  try (SequenceFile.Reader reader = new SequenceFile.Reader(conf, rFile)) {
    // Only key.toString() is retained, so both holders can be reused
    // across records.
    Text key = new Text();
    CrawlDatum value = new CrawlDatum();
    while (reader.next(key, value)) {
      read.add(key.toString());
    }
  }
  return read;
}
/**
 * Reads every record stored in the first partition
 * ({@code part-00000/data}) of the current CrawlDb under {@code crawldbPath}.
 *
 * @return a map from the string form of each record key to the
 *         {@link CrawlDatum} read for it; a later record with the same key
 *         replaces an earlier one
 * @throws IOException
 *           if the sequence file cannot be opened or read
 */
private HashMap<String, CrawlDatum> readCrawldbRecords() throws IOException {
  Path dbfile = new Path(crawldbPath, CrawlDb.CURRENT_NAME + "/part-00000/data");
  System.out.println("reading:" + dbfile);
  Option rFile = SequenceFile.Reader.file(dbfile);
  HashMap<String, CrawlDatum> read = new HashMap<String, CrawlDatum>();
  // try-with-resources guarantees the reader is released even when next()
  // throws; previously it was never closed.
  try (SequenceFile.Reader reader = new SequenceFile.Reader(conf, rFile)) {
    // The key is only kept as a String, so one Text instance can be reused.
    Text key = new Text();
    while (true) {
      // A fresh datum per record: the instance itself is stored in the map,
      // so reusing it would make every entry alias the last record read.
      CrawlDatum value = new CrawlDatum();
      if (!reader.next(key, value)) {
        break;
      }
      read.put(key.toString(), value);
    }
  }
  return read;
}
/** * Read contents of fetchlist. * * @param fetchlist * path to Generated fetchlist * @return Generated {@link URLCrawlDatum} objects * @throws IOException */ private ArrayList<URLCrawlDatum> readContents(Path fetchlist) throws IOException { // verify results Option fFile = SequenceFile.Reader.file(fetchlist); SequenceFile.Reader reader = new SequenceFile.Reader(conf, fFile); ArrayList<URLCrawlDatum> l = new ArrayList<URLCrawlDatum>(); READ: do { Text key = new Text(); CrawlDatum value = new CrawlDatum(); if (!reader.next(key, value)) { break READ; } l.add(new URLCrawlDatum(key, value)); } while (true); reader.close(); return l; }
/** * Read contents of fetchlist. * * @param fetchlist * path to Generated fetchlist * @return Generated {@link URLCrawlDatum} objects * @throws IOException */ private ArrayList<URLCrawlDatum> readContents(Path fetchlist) throws IOException { // verify results Option rFile = SequenceFile.Reader.file(fetchlist); SequenceFile.Reader reader = new SequenceFile.Reader(conf, rFile); ArrayList<URLCrawlDatum> l = new ArrayList<URLCrawlDatum>(); READ: do { Text key = new Text(); CrawlDatum value = new CrawlDatum(); if (!reader.next(key, value)) { break READ; } l.add(new URLCrawlDatum(key, value)); } while (true); reader.close(); return l; }
/**
 * Loads statistics from the sequence file at {@code path}.
 *
 * <p>Each record's key selects what its value populates:
 * <ul>
 * <li>key {@code 0}: value decoded as an int into {@code percentage}</li>
 * <li>key {@code -1}: value decoded as a double into
 * {@code mapperOverlapRatio}</li>
 * <li>key {@code -2}: value decoded as an int into {@code mapperNumber}</li>
 * <li>key {@code > 0}: value loaded into a new {@link HLLCounter} that is
 * stored in {@code counterMap} under that key</li>
 * </ul>
 * Records with any other key are ignored.
 *
 * @param path
 *          location of the sequence file to read
 * @param precision
 *          precision passed to every {@link HLLCounter} created
 * @throws IOException
 *           if the file cannot be opened or read
 */
public CubeStatsResult(Path path, int precision) throws IOException {
    final Configuration hadoopConf = HadoopUtil.getCurrentConfiguration();
    try (Reader reader = new SequenceFile.Reader(hadoopConf, SequenceFile.Reader.file(path))) {
        // Key/value holders are instantiated from the classes recorded in the file.
        final LongWritable key = (LongWritable) ReflectionUtils.newInstance(reader.getKeyClass(), hadoopConf);
        final BytesWritable value = (BytesWritable) ReflectionUtils.newInstance(reader.getValueClass(),
                hadoopConf);

        while (reader.next(key, value)) {
            final long id = key.get();
            if (id > 0) {
                // Positive keys carry serialized counter registers.
                HLLCounter counter = new HLLCounter(precision);
                counter.readRegisters(new ByteArray(value.getBytes()).asBuffer());
                counterMap.put(id, counter);
            } else if (id == 0L) {
                percentage = Bytes.toInt(value.getBytes());
            } else if (id == -1L) {
                mapperOverlapRatio = Bytes.toDouble(value.getBytes());
            } else if (id == -2L) {
                mapperNumber = Bytes.toInt(value.getBytes());
            }
        }
    }
}
/**
 * Opens a sequence-file reader over the file backing the given split and
 * positions it for reading that split.
 *
 * <p>{@code end} is set to the split's start offset plus its length. When the
 * split starts beyond the reader's position after opening, the reader is
 * asked to {@code sync} to the split start. {@code start} then records the
 * resulting reader position, and {@code done} is set when that position is
 * already at or past {@code end}.
 *
 * @param inputSplit
 *          the split to read; must be a {@link FileSplit}
 * @param context
 *          task context supplying the Hadoop configuration
 * @throws IOException
 *           if the file cannot be opened or positioned
 * @throws InterruptedException
 *           declared by the overridden method
 */
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext context)
    throws IOException, InterruptedException {
  final FileSplit fileSplit = (FileSplit) inputSplit;
  final long splitStart = fileSplit.getStart();

  in = new SequenceFile.Reader(context.getConfiguration(),
      SequenceFile.Reader.file(fileSplit.getPath()));
  this.end = splitStart + inputSplit.getLength();

  // Only reposition when the split begins after where the reader currently is.
  if (in.getPosition() < splitStart) {
    in.sync(splitStart);
  }

  start = in.getPosition();
  done = end <= start;
}
@Override protected boolean doProcess(Record inputRecord, InputStream in) throws IOException { FSDataInputStream fsInputStream = new FSDataInputStream(new ForwardOnlySeekable(in)); Option opt = SequenceFile.Reader.stream(fsInputStream); SequenceFile.Metadata sequenceFileMetaData = null; SequenceFile.Reader reader = null; try { reader = new SequenceFile.Reader(conf, opt); if (includeMetaData) { sequenceFileMetaData = reader.getMetadata(); } Class keyClass = reader.getKeyClass(); Class valueClass = reader.getValueClass(); Record template = inputRecord.copy(); removeAttachments(template); while (true) { Writable key = (Writable)ReflectionUtils.newInstance(keyClass, conf); Writable val = (Writable)ReflectionUtils.newInstance(valueClass, conf); try { if (!reader.next(key, val)) { break; } } catch (EOFException ex) { // SequenceFile.Reader will throw an EOFException after reading // all the data, if it doesn't know the length. Since we are // passing in an InputStream, we hit this case; LOG.trace("Received expected EOFException", ex); break; } incrementNumRecords(); Record outputRecord = template.copy(); outputRecord.put(keyField, key); outputRecord.put(valueField, val); outputRecord.put(Fields.ATTACHMENT_MIME_TYPE, OUTPUT_MEDIA_TYPE); if (includeMetaData && sequenceFileMetaData != null) { outputRecord.put(SEQUENCE_FILE_META_DATA, sequenceFileMetaData); } // pass record to next command in chain: if (!getChild().process(outputRecord)) { return false; } } } finally { Closeables.closeQuietly(reader); } return true; }