/** {@inheritDoc} */
@Override
protected RecordReader<LongWritable, T> createDBRecordReader(
    DBInputSplit split, Configuration conf) throws IOException {
  DBConfiguration dbConf = getDBConf();
  @SuppressWarnings("unchecked")
  Class<T> inputClass = (Class<T>) (dbConf.getInputClass());
  String dbProductName = getDBProductName();

  LOG.debug("Creating db record reader for db product: " + dbProductName);

  try {
    return new SQLServerDBRecordReader<T>(split, inputClass, conf,
        getConnection(), dbConf, dbConf.getInputConditions(),
        dbConf.getInputFieldNames(), dbConf.getInputTableName(),
        dbProductName);
  } catch (SQLException ex) {
    throw new IOException(ex);
  }
}
@Override
protected RecordReader<LongWritable, T> createDBRecordReader(
    DBInputSplit split, Configuration conf) throws IOException {
  DBConfiguration dbConf = getDBConf();
  @SuppressWarnings("unchecked")
  Class<T> inputClass = (Class<T>) (dbConf.getInputClass());
  try {
    // Use DB2-specific db reader
    return new Db2DataDrivenDBRecordReader<T>(split, inputClass, conf,
        getConnection(), dbConf, dbConf.getInputConditions(),
        dbConf.getInputFieldNames(), dbConf.getInputTableName());
  } catch (SQLException ex) {
    throw new IOException(ex);
  }
}
protected RecordReader<LongWritable, T> createDBRecordReader(
    DBInputSplit split, Configuration conf) throws IOException {
  DBConfiguration dbConf = getDBConf();
  @SuppressWarnings("unchecked")
  Class<T> inputClass = (Class<T>) (dbConf.getInputClass());
  String dbProductName = getDBProductName();

  LOG.debug("Creating db record reader for db product: " + dbProductName);

  try {
    return new DataDrivenDBRecordReader<T>(split, inputClass, conf,
        getConnection(), dbConf, dbConf.getInputConditions(),
        dbConf.getInputFieldNames(), dbConf.getInputTableName(),
        dbProductName);
  } catch (SQLException ex) {
    throw new IOException(ex);
  }
}
@Override
protected RecordReader<LongWritable, T> createDBRecordReader(
    DBInputSplit split, Configuration conf) throws IOException {
  DBConfiguration dbConf = getDBConf();
  @SuppressWarnings("unchecked")
  Class<T> inputClass = (Class<T>) (dbConf.getInputClass());
  try {
    // Use Oracle-specific db reader
    return new OracleDataDrivenDBRecordReader<T>(split, inputClass, conf,
        getConnection(), dbConf, dbConf.getInputConditions(),
        dbConf.getInputFieldNames(), dbConf.getInputTableName());
  } catch (SQLException ex) {
    throw new IOException(ex);
  }
}
/**
 * Actually instantiate the user's chosen RecordReader implementation.
 */
@SuppressWarnings("unchecked")
private void createChildReader() throws IOException, InterruptedException {
  LOG.debug("ChildSplit operates on: " + split.getPath(index));

  Configuration conf = context.getConfiguration();

  // Determine the file format we're reading.
  Class rrClass;
  if (ExportJobBase.isSequenceFiles(conf, split.getPath(index))) {
    rrClass = SequenceFileRecordReader.class;
  } else {
    rrClass = LineRecordReader.class;
  }

  // Create the appropriate record reader.
  this.rr = (RecordReader<LongWritable, Object>)
      ReflectionUtils.newInstance(rrClass, conf);
}
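After createChildReader() returns, the shim still has to hand the newly constructed child reader a per-file split, since only the constructor has run. A minimal hedged sketch of that follow-up step, assuming the surrounding class keeps the same split, index, context, and rr fields used above; the FileSplit construction here is illustrative, not necessarily the exact code:

// Hedged sketch (assumption about the surrounding class): initialize the child
// reader with the single file carved out of the CombineFileSplit inspected above.
FileSplit fileSplit = new FileSplit(split.getPath(index),
    split.getOffset(index), split.getLength(index), split.getLocations());
this.rr.initialize(fileSplit, context);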
/** {@inheritDoc} */
@Override
protected RecordReader<LongWritable, T> createDBRecordReader(
    DBInputSplit split, Configuration conf) throws IOException {
  DBConfiguration dbConf = getDBConf();
  @SuppressWarnings("unchecked")
  Class<T> inputClass = (Class<T>) (dbConf.getInputClass());
  try {
    // Use Microsoft SQL Server specific db reader
    return new SqlServerRecordReader<T>(split, inputClass, conf,
        getConnection(), dbConf, dbConf.getInputConditions(),
        dbConf.getInputFieldNames(), dbConf.getInputTableName());
  } catch (SQLException ex) {
    throw new IOException(ex);
  }
}
private static List<Text> readSplit(KeyValueTextInputFormat format,
    InputSplit split, Job job) throws IOException, InterruptedException {
  List<Text> result = new ArrayList<Text>();
  Configuration conf = job.getConfiguration();
  TaskAttemptContext context = MapReduceTestUtil.
      createDummyMapTaskAttemptContext(conf);
  RecordReader<Text, Text> reader = format.createRecordReader(split,
      MapReduceTestUtil.createDummyMapTaskAttemptContext(conf));
  MapContext<Text, Text, Text, Text> mcontext =
      new MapContextImpl<Text, Text, Text, Text>(conf,
          context.getTaskAttemptID(), reader, null, null,
          MapReduceTestUtil.createDummyReporter(), split);
  reader.initialize(split, mcontext);
  while (reader.nextKeyValue()) {
    result.add(new Text(reader.getCurrentValue()));
  }
  reader.close();
  return result;
}
@Test
public void testReinit() throws Exception {
  // Test that a split containing multiple files works correctly,
  // with the child RecordReader getting its initialize() method
  // called a second time.
  TaskAttemptID taskId = new TaskAttemptID("jt", 0, TaskType.MAP, 0, 0);
  Configuration conf = new Configuration();
  TaskAttemptContext context = new TaskAttemptContextImpl(conf, taskId);

  // This will create a CombineFileRecordReader that itself contains a
  // DummyRecordReader.
  InputFormat inputFormat = new ChildRRInputFormat();

  Path [] files = { new Path("file1"), new Path("file2") };
  long [] lengths = { 1, 1 };

  CombineFileSplit split = new CombineFileSplit(files, lengths);
  RecordReader rr = inputFormat.createRecordReader(split, context);
  assertTrue("Unexpected RR type!", rr instanceof CombineFileRecordReader);

  // first initialize() call comes from MapTask. We'll do it here.
  rr.initialize(split, context);

  // First value is first filename.
  assertTrue(rr.nextKeyValue());
  assertEquals("file1", rr.getCurrentValue().toString());

  // The inner RR will return false, because it only emits one (k, v) pair.
  // But there's another sub-split to process. This returns true to us.
  assertTrue(rr.nextKeyValue());

  // And the 2nd rr will have its initialize method called correctly.
  assertEquals("file2", rr.getCurrentValue().toString());

  // But after both child RR's have returned their singleton (k, v), this
  // should also return false.
  assertFalse(rr.nextKeyValue());
}
private static List<Text> readSplit(InputFormat<LongWritable, Text> format,
    InputSplit split, Job job) throws IOException, InterruptedException {
  List<Text> result = new ArrayList<Text>();
  Configuration conf = job.getConfiguration();
  TaskAttemptContext context = MapReduceTestUtil.
      createDummyMapTaskAttemptContext(conf);
  RecordReader<LongWritable, Text> reader = format.createRecordReader(split,
      MapReduceTestUtil.createDummyMapTaskAttemptContext(conf));
  MapContext<LongWritable, Text, LongWritable, Text> mcontext =
      new MapContextImpl<LongWritable, Text, LongWritable, Text>(conf,
          context.getTaskAttemptID(), reader, null, null,
          MapReduceTestUtil.createDummyReporter(), split);
  reader.initialize(split, mcontext);
  while (reader.nextKeyValue()) {
    result.add(new Text(reader.getCurrentValue()));
  }
  // Release the reader, matching the KeyValueTextInputFormat variant above.
  reader.close();
  return result;
}
@Override
protected RecordReader<LongWritable, T> createDBRecordReader(DBInputSplit split,
    Configuration conf) throws IOException {
  DBConfiguration dbConf = getDBConf();
  @SuppressWarnings("unchecked")
  Class<T> inputClass = (Class<T>) (dbConf.getInputClass());
  try {
    // Use Oracle-specific db reader
    return new OracleDataDrivenDBRecordReader<T>(split, inputClass, conf,
        createConnection(), dbConf, dbConf.getInputConditions(),
        dbConf.getInputFieldNames(), dbConf.getInputTableName());
  } catch (SQLException ex) {
    throw new IOException(ex.getMessage());
  }
}
/**
 * Add mapper (the first mapper) that reads input from the input
 * context and writes to the queue.
 */
@SuppressWarnings("unchecked")
void addMapper(TaskInputOutputContext inputContext,
    ChainBlockingQueue<KeyValuePair<?, ?>> output, int index)
    throws IOException, InterruptedException {
  Configuration conf = getConf(index);
  Class<?> keyOutClass = conf.getClass(MAPPER_OUTPUT_KEY_CLASS, Object.class);
  Class<?> valueOutClass = conf.getClass(MAPPER_OUTPUT_VALUE_CLASS,
      Object.class);

  RecordReader rr = new ChainRecordReader(inputContext);
  RecordWriter rw = new ChainRecordWriter(keyOutClass, valueOutClass, output,
      conf);
  Mapper.Context mapperContext = createMapContext(rr, rw,
      (MapContext) inputContext, getConf(index));
  MapRunner runner = new MapRunner(mappers.get(index), mapperContext, rr, rw);
  threads.add(runner);
}
/**
 * Add mapper that reads and writes from/to the queue.
 */
@SuppressWarnings("unchecked")
void addMapper(ChainBlockingQueue<KeyValuePair<?, ?>> input,
    ChainBlockingQueue<KeyValuePair<?, ?>> output,
    TaskInputOutputContext context, int index)
    throws IOException, InterruptedException {
  Configuration conf = getConf(index);
  Class<?> keyClass = conf.getClass(MAPPER_INPUT_KEY_CLASS, Object.class);
  Class<?> valueClass = conf.getClass(MAPPER_INPUT_VALUE_CLASS, Object.class);
  Class<?> keyOutClass = conf.getClass(MAPPER_OUTPUT_KEY_CLASS, Object.class);
  Class<?> valueOutClass = conf.getClass(MAPPER_OUTPUT_VALUE_CLASS,
      Object.class);
  RecordReader rr = new ChainRecordReader(keyClass, valueClass, input, conf);
  RecordWriter rw = new ChainRecordWriter(keyOutClass, valueOutClass, output,
      conf);
  MapRunner runner = new MapRunner(mappers.get(index), createMapContext(rr, rw,
      context, getConf(index)), rr, rw);
  threads.add(runner);
}
@Override
public RecordReader<LongWritable, LongWritable> createRecordReader(
    InputSplit split, TaskAttemptContext context)
    throws IOException, InterruptedException {
  int taskId = context.getTaskAttemptID().getTaskID().getId();
  int numMapTasks = context.getConfiguration().getInt(NUM_MAPS_KEY, NUM_MAPS);
  int numIterations = context.getConfiguration().getInt(
      NUM_IMPORT_ROUNDS_KEY, NUM_IMPORT_ROUNDS);
  int iteration = context.getConfiguration().getInt(ROUND_NUM_KEY, 0);

  taskId = taskId + iteration * numMapTasks;
  numMapTasks = numMapTasks * numIterations;

  // Ensure that chainId is unique per task and across iterations.
  long chainId = Math.abs(new Random().nextLong());
  chainId = chainId - (chainId % numMapTasks) + taskId;

  LongWritable[] keys = new LongWritable[] {new LongWritable(chainId)};
  return new FixedRecordReader<LongWritable, LongWritable>(keys, keys);
}
@SuppressWarnings({"rawtypes", "unchecked"}) public void initReader() throws IOException { try { Configuration conf = WorkerContext.get().getConf(); String inputFormatClassName = conf.get(AngelConf.ANGEL_INPUTFORMAT_CLASS, AngelConf.DEFAULT_ANGEL_INPUTFORMAT_CLASS); Class<? extends org.apache.hadoop.mapreduce.InputFormat> inputFormatClass = (Class<? extends org.apache.hadoop.mapreduce.InputFormat>) Class .forName(inputFormatClassName); org.apache.hadoop.mapreduce.InputFormat inputFormat = ReflectionUtils.newInstance(inputFormatClass, new JobConf(conf)); MRTaskContext taskContext = new MRTaskContext(conf); org.apache.hadoop.mapreduce.RecordReader<KEY, VALUE> recordReader = inputFormat.createRecordReader(split, taskContext); recordReader.initialize(split, taskContext); setReader(new DFSReaderNewAPI(recordReader)); } catch (Exception x) { LOG.error("init reader error ", x); throw new IOException(x); } }
@Override
public RecordReader<LongWritable, Text> createRecordReader(
    InputSplit inputSplit, TaskAttemptContext context) {
  try {
    return new XMLRecordReader(inputSplit, context.getConfiguration());
  } catch (IOException e) {
    return null;
  }
}
@Test
public void getSplits() throws Exception {
  S3MapReduceCpOptions options = getOptions();
  Configuration configuration = new Configuration();
  configuration.set("mapred.map.tasks", String.valueOf(options.getMaxMaps()));
  CopyListing.getCopyListing(configuration, CREDENTIALS, options).buildListing(
      new Path(cluster.getFileSystem().getUri().toString()
          + "/tmp/testDynInputFormat/fileList.seq"), options);

  JobContext jobContext = new JobContextImpl(configuration, new JobID());
  DynamicInputFormat<Text, CopyListingFileStatus> inputFormat =
      new DynamicInputFormat<>();
  List<InputSplit> splits = inputFormat.getSplits(jobContext);

  int nFiles = 0;
  int taskId = 0;

  for (InputSplit split : splits) {
    RecordReader<Text, CopyListingFileStatus> recordReader =
        inputFormat.createRecordReader(split, null);
    StubContext stubContext =
        new StubContext(jobContext.getConfiguration(), recordReader, taskId);
    final TaskAttemptContext taskAttemptContext = stubContext.getContext();

    recordReader.initialize(splits.get(0), taskAttemptContext);
    float previousProgressValue = 0f;
    while (recordReader.nextKeyValue()) {
      CopyListingFileStatus fileStatus = recordReader.getCurrentValue();
      String source = fileStatus.getPath().toString();
      assertTrue(expectedFilePaths.contains(source));
      final float progress = recordReader.getProgress();
      assertTrue(progress >= previousProgressValue);
      assertTrue(progress >= 0.0f);
      assertTrue(progress <= 1.0f);
      previousProgressValue = progress;
      ++nFiles;
    }
    assertTrue(recordReader.getProgress() == 1.0f);

    ++taskId;
  }

  Assert.assertEquals(expectedFilePaths.size(), nFiles);
}
public StubContext(Configuration conf,
    RecordReader<Text, CopyListingFileStatus> reader, int taskId)
    throws IOException, InterruptedException {

  WrappedMapper<Text, CopyListingFileStatus, Text, Text> wrappedMapper =
      new WrappedMapper<>();

  MapContextImpl<Text, CopyListingFileStatus, Text, Text> contextImpl =
      new MapContextImpl<>(conf, getTaskAttemptID(taskId), reader, writer,
          null, reporter, null);

  this.reader = reader;
  mapperContext = wrappedMapper.getMapContext(contextImpl);
}
@Override
@SuppressWarnings("unchecked")
public RecordReader createRecordReader(InputSplit split,
    TaskAttemptContext context) throws IOException {
  CombineFileSplit combineSplit = (CombineFileSplit) split;

  // Use CombineFileRecordReader since this can handle CombineFileSplits
  // and instantiate another RecordReader in a loop; do this with the
  // CombineShimRecordReader.
  RecordReader rr = new CombineFileRecordReader(combineSplit, context,
      CombineShimRecordReader.class);
  return rr;
}
protected RecordReader<LongWritable, T> createDBRecordReader(
    com.cloudera.sqoop.mapreduce.db.DBInputFormat.DBInputSplit split,
    Configuration conf) throws IOException {
  @SuppressWarnings("unchecked")
  Class<T> inputClass = (Class<T>) (dbConf.getInputClass());
  try {
    // use database product name to determine appropriate record reader.
    if (dbProductName.startsWith("ORACLE")) {
      // use Oracle-specific db reader.
      return new OracleDBRecordReader<T>(split, inputClass, conf,
          getConnection(), getDBConf(), conditions, fieldNames, tableName);
    } else if (dbProductName.startsWith("DB2")) {
      // use DB2-specific db reader.
      return new Db2DBRecordReader<T>(split, inputClass, conf,
          getConnection(), getDBConf(), conditions, fieldNames, tableName);
    } else {
      // Generic reader.
      return new DBRecordReader<T>(split, inputClass, conf,
          getConnection(), getDBConf(), conditions, fieldNames, tableName);
    }
  } catch (SQLException ex) {
    throw new IOException(ex);
  }
}
/** {@inheritDoc} */
@Override
public RecordReader<LongWritable, T> createRecordReader(InputSplit split,
    TaskAttemptContext context) throws IOException, InterruptedException {
  return createDBRecordReader(
      (com.cloudera.sqoop.mapreduce.db.DBInputFormat.DBInputSplit) split,
      context.getConfiguration());
}
@Override
public RecordReader<WritableComparable, HCatRecord> createRecordReader(
    InputSplit split, TaskAttemptContext taskContext)
    throws IOException, InterruptedException {
  LOG.debug("Creating a SqoopHCatRecordReader");
  return new SqoopHCatRecordReader(split, taskContext, this);
}
public RecordReader<WritableComparable, HCatRecord> createHCatRecordReader(
    InputSplit split, TaskAttemptContext taskContext)
    throws IOException, InterruptedException {
  LOG.debug("Creating a base HCatRecordReader");
  return super.createRecordReader(split, taskContext);
}
@Override
public RecordReader<AvroWrapper<T>, NullWritable> createRecordReader(
    InputSplit split, TaskAttemptContext context)
    throws IOException, InterruptedException {
  context.setStatus(split.toString());
  return new AvroRecordReader<T>();
}
@Override
public RecordReader<Tuple2<Path, Long>, Text> createRecordReader(
    InputSplit split, TaskAttemptContext context) {
  if (LOG.isTraceEnabled()) {
    LOG.trace("create LogfileRecordReader");
  }
  return new LogfileRecordReader();
}
@Override
public org.apache.hadoop.mapred.RecordReader<IntWritable, NullWritable>
    getRecordReader(final org.apache.hadoop.mapred.InputSplit split,
        JobConf job, Reporter reporter) throws IOException {
  return new org.apache.hadoop.mapred.RecordReader<IntWritable, NullWritable>() {

    private final IntWritable i =
        new IntWritable(((MapredSequentialSplit) split).getInit());
    private int maxVal = i.get() + maxDepth + 1;

    @Override
    public boolean next(IntWritable key, NullWritable value)
        throws IOException {
      i.set(i.get() + 1);
      return i.get() < maxVal;
    }

    @Override
    public IntWritable createKey() {
      return new IntWritable(i.get());
    }

    @Override
    public NullWritable createValue() {
      return NullWritable.get();
    }

    @Override
    public long getPos() throws IOException {
      return 0;
    }

    @Override
    public void close() throws IOException {
    }

    @Override
    public float getProgress() throws IOException {
      return 0;
    }
  };
}
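For contrast with the new-API readers elsewhere in this section, a hedged usage sketch (illustrative only, not taken from the surrounding code) of how an old-API org.apache.hadoop.mapred.RecordReader such as the anonymous class above is typically driven by its caller:

// Hedged sketch: createKey()/createValue() provide reusable holders that
// next(...) is expected to fill in until it returns false.
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.RecordReader;

public class OldApiReaderSketch {
  static int drain(RecordReader<IntWritable, NullWritable> reader)
      throws IOException {
    IntWritable key = reader.createKey();
    NullWritable value = reader.createValue();
    int count = 0;
    // Count records until the reader is exhausted. Note that the anonymous
    // reader above advances an internal counter rather than writing into 'key'.
    while (reader.next(key, value)) {
      count++;
    }
    reader.close();
    return count;
  }
}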
private int countRecords(int numSplits)
    throws IOException, InterruptedException {
  InputFormat<Text, BytesWritable> format =
      new SequenceFileInputFilter<Text, BytesWritable>();
  if (numSplits == 0) {
    numSplits =
        random.nextInt(MAX_LENGTH / (SequenceFile.SYNC_INTERVAL / 20)) + 1;
  }
  FileInputFormat.setMaxInputSplitSize(job,
      fs.getFileStatus(inFile).getLen() / numSplits);
  TaskAttemptContext context = MapReduceTestUtil.
      createDummyMapTaskAttemptContext(job.getConfiguration());

  // check each split
  int count = 0;
  for (InputSplit split : format.getSplits(job)) {
    RecordReader<Text, BytesWritable> reader =
        format.createRecordReader(split, context);
    MapContext<Text, BytesWritable, Text, BytesWritable> mcontext =
        new MapContextImpl<Text, BytesWritable, Text, BytesWritable>(
            job.getConfiguration(), context.getTaskAttemptID(), reader, null,
            null, MapReduceTestUtil.createDummyReporter(), split);
    reader.initialize(split, mcontext);
    try {
      while (reader.nextKeyValue()) {
        LOG.info("Accept record " + reader.getCurrentKey().toString());
        count++;
      }
    } finally {
      reader.close();
    }
  }
  return count;
}
@SuppressWarnings("unchecked") @Override public RecordReader<Text,Text> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException { return new CombineFileRecordReader((CombineFileSplit) split, context, (Class) DummyRecordReader.class); }
@Test
public void testRecordReaderInit() throws InterruptedException, IOException {
  // Test that we properly initialize the child recordreader when
  // CombineFileInputFormat and CombineFileRecordReader are used.

  TaskAttemptID taskId = new TaskAttemptID("jt", 0, TaskType.MAP, 0, 0);
  Configuration conf1 = new Configuration();
  conf1.set(DUMMY_KEY, "STATE1");
  TaskAttemptContext context1 = new TaskAttemptContextImpl(conf1, taskId);

  // This will create a CombineFileRecordReader that itself contains a
  // DummyRecordReader.
  InputFormat inputFormat = new ChildRRInputFormat();

  Path [] files = { new Path("file1") };
  long [] lengths = { 1 };

  CombineFileSplit split = new CombineFileSplit(files, lengths);

  RecordReader rr = inputFormat.createRecordReader(split, context1);
  assertTrue("Unexpected RR type!", rr instanceof CombineFileRecordReader);

  // Verify that the initial configuration is the one being used.
  // Right after construction the dummy key should have value "STATE1"
  assertEquals("Invalid initial dummy key value", "STATE1",
      rr.getCurrentKey().toString());

  // Switch the active context for the RecordReader...
  Configuration conf2 = new Configuration();
  conf2.set(DUMMY_KEY, "STATE2");
  TaskAttemptContext context2 = new TaskAttemptContextImpl(conf2, taskId);
  rr.initialize(split, context2);

  // And verify that the new context is updated into the child record reader.
  assertEquals("Invalid secondary dummy key value", "STATE2",
      rr.getCurrentKey().toString());
}
/**
 * Test with no record length set.
 */
@Test (timeout=5000)
public void testNoRecordLength() throws Exception {
  localFs.delete(workDir, true);
  Path file = new Path(workDir, new String("testFormat.txt"));
  createFile(file, null, 10, 10);
  // Create the job and do not set fixed record length
  Job job = Job.getInstance(defaultConf);
  FileInputFormat.setInputPaths(job, workDir);
  FixedLengthInputFormat format = new FixedLengthInputFormat();
  List<InputSplit> splits = format.getSplits(job);
  boolean exceptionThrown = false;
  for (InputSplit split : splits) {
    try {
      TaskAttemptContext context = MapReduceTestUtil.
          createDummyMapTaskAttemptContext(job.getConfiguration());
      RecordReader<LongWritable, BytesWritable> reader =
          format.createRecordReader(split, context);
      MapContext<LongWritable, BytesWritable, LongWritable, BytesWritable>
          mcontext =
          new MapContextImpl<LongWritable, BytesWritable, LongWritable,
              BytesWritable>(job.getConfiguration(),
              context.getTaskAttemptID(), reader, null, null,
              MapReduceTestUtil.createDummyReporter(), split);
      reader.initialize(split, mcontext);
    } catch (IOException ioe) {
      exceptionThrown = true;
      LOG.info("Exception message:" + ioe.getMessage());
    }
  }
  assertTrue("Exception for not setting record length:", exceptionThrown);
}
/**
 * Test with record length set to 0.
 */
@Test (timeout=5000)
public void testZeroRecordLength() throws Exception {
  localFs.delete(workDir, true);
  Path file = new Path(workDir, new String("testFormat.txt"));
  createFile(file, null, 10, 10);
  Job job = Job.getInstance(defaultConf);
  // Set the fixed length record length config property
  FixedLengthInputFormat format = new FixedLengthInputFormat();
  format.setRecordLength(job.getConfiguration(), 0);
  FileInputFormat.setInputPaths(job, workDir);
  List<InputSplit> splits = format.getSplits(job);
  boolean exceptionThrown = false;
  for (InputSplit split : splits) {
    try {
      TaskAttemptContext context =
          MapReduceTestUtil.createDummyMapTaskAttemptContext(
              job.getConfiguration());
      RecordReader<LongWritable, BytesWritable> reader =
          format.createRecordReader(split, context);
      MapContext<LongWritable, BytesWritable, LongWritable, BytesWritable>
          mcontext =
          new MapContextImpl<LongWritable, BytesWritable, LongWritable,
              BytesWritable>(job.getConfiguration(),
              context.getTaskAttemptID(), reader, null, null,
              MapReduceTestUtil.createDummyReporter(), split);
      reader.initialize(split, mcontext);
    } catch (IOException ioe) {
      exceptionThrown = true;
      LOG.info("Exception message:" + ioe.getMessage());
    }
  }
  assertTrue("Exception for zero record length:", exceptionThrown);
}
/**
 * Test with record length set to a negative value.
 */
@Test (timeout=5000)
public void testNegativeRecordLength() throws Exception {
  localFs.delete(workDir, true);
  Path file = new Path(workDir, new String("testFormat.txt"));
  createFile(file, null, 10, 10);
  // Set the fixed length record length config property
  Job job = Job.getInstance(defaultConf);
  FixedLengthInputFormat format = new FixedLengthInputFormat();
  format.setRecordLength(job.getConfiguration(), -10);
  FileInputFormat.setInputPaths(job, workDir);
  List<InputSplit> splits = format.getSplits(job);
  boolean exceptionThrown = false;
  for (InputSplit split : splits) {
    try {
      TaskAttemptContext context = MapReduceTestUtil.
          createDummyMapTaskAttemptContext(job.getConfiguration());
      RecordReader<LongWritable, BytesWritable> reader =
          format.createRecordReader(split, context);
      MapContext<LongWritable, BytesWritable, LongWritable, BytesWritable>
          mcontext =
          new MapContextImpl<LongWritable, BytesWritable, LongWritable,
              BytesWritable>(job.getConfiguration(),
              context.getTaskAttemptID(), reader, null, null,
              MapReduceTestUtil.createDummyReporter(), split);
      reader.initialize(split, mcontext);
    } catch (IOException ioe) {
      exceptionThrown = true;
      LOG.info("Exception message:" + ioe.getMessage());
    }
  }
  assertTrue("Exception for negative record length:", exceptionThrown);
}
private static List<String> readSplit(FixedLengthInputFormat format,
    InputSplit split, Job job) throws Exception {
  List<String> result = new ArrayList<String>();
  TaskAttemptContext context = MapReduceTestUtil.
      createDummyMapTaskAttemptContext(job.getConfiguration());
  RecordReader<LongWritable, BytesWritable> reader =
      format.createRecordReader(split, context);
  MapContext<LongWritable, BytesWritable, LongWritable, BytesWritable>
      mcontext =
      new MapContextImpl<LongWritable, BytesWritable, LongWritable,
          BytesWritable>(job.getConfiguration(), context.getTaskAttemptID(),
          reader, null, null, MapReduceTestUtil.createDummyReporter(), split);
  LongWritable key;
  BytesWritable value;
  try {
    reader.initialize(split, mcontext);
    while (reader.nextKeyValue()) {
      key = reader.getCurrentKey();
      value = reader.getCurrentValue();
      result.add(new String(value.getBytes(), 0, value.getLength()));
    }
  } finally {
    reader.close();
  }
  return result;
}
/**
 * From each split sampled, take the first numSamples / numSplits records.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K, V> inf, Job job)
    throws IOException, InterruptedException {
  List<InputSplit> splits = inf.getSplits(job);
  ArrayList<K> samples = new ArrayList<K>(numSamples);
  int splitsToSample = Math.min(maxSplitsSampled, splits.size());
  int samplesPerSplit = numSamples / splitsToSample;
  long records = 0;
  for (int i = 0; i < splitsToSample; ++i) {
    TaskAttemptContext samplingContext = new TaskAttemptContextImpl(
        job.getConfiguration(), new TaskAttemptID());
    RecordReader<K, V> reader = inf.createRecordReader(
        splits.get(i), samplingContext);
    reader.initialize(splits.get(i), samplingContext);
    while (reader.nextKeyValue()) {
      samples.add(ReflectionUtils.copy(job.getConfiguration(),
          reader.getCurrentKey(), null));
      ++records;
      if ((i + 1) * samplesPerSplit <= records) {
        break;
      }
    }
    reader.close();
  }
  return (K[]) samples.toArray();
}
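The snippet above only shows the sampling loop itself, not how a sampler is invoked. As a hedged sketch (an assumption based on the standard org.apache.hadoop.mapreduce.lib.partition API, not on anything shown here), this is the usual way a SplitSampler feeds partition boundaries to TotalOrderPartitioner before a total-order job is submitted; the path and key/value types are illustrative:

// Hedged sketch: wiring a SplitSampler into TotalOrderPartitioner.
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

public class SamplerUsageSketch {
  public static void configure(Job job) throws Exception {
    // Tell the partitioner where the partition boundaries will be stored.
    Path partitionFile = new Path("_partitions");  // hypothetical path
    TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFile);

    // Sample at most 10 splits, keeping roughly 1000 keys in total.
    InputSampler.Sampler<Text, Text> sampler =
        new InputSampler.SplitSampler<Text, Text>(1000, 10);

    // writePartitionFile() invokes getSample(...) internally, sorts the keys,
    // and writes the chosen split points to the partition file.
    InputSampler.writePartitionFile(job, sampler);
    job.setPartitionerClass(TotalOrderPartitioner.class);
  }
}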
/**
 * For each split sampled, emit when the ratio of the number of records
 * retained to the total record count is less than the specified
 * frequency.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K, V> inf, Job job)
    throws IOException, InterruptedException {
  List<InputSplit> splits = inf.getSplits(job);
  ArrayList<K> samples = new ArrayList<K>();
  int splitsToSample = Math.min(maxSplitsSampled, splits.size());
  long records = 0;
  long kept = 0;
  for (int i = 0; i < splitsToSample; ++i) {
    TaskAttemptContext samplingContext = new TaskAttemptContextImpl(
        job.getConfiguration(), new TaskAttemptID());
    RecordReader<K, V> reader = inf.createRecordReader(
        splits.get(i), samplingContext);
    reader.initialize(splits.get(i), samplingContext);
    while (reader.nextKeyValue()) {
      ++records;
      if ((double) kept / records < freq) {
        samples.add(ReflectionUtils.copy(job.getConfiguration(),
            reader.getCurrentKey(), null));
        ++kept;
      }
    }
    reader.close();
  }
  return (K[]) samples.toArray();
}
/**
 * Construct a CompositeRecordReader for the children of this InputFormat
 * as defined in the init expression.
 * The outermost join need only be composable, not necessarily a composite.
 * Mandating TupleWritable isn't strictly correct.
 */
@SuppressWarnings("unchecked") // child types unknown
public RecordReader<K, TupleWritable> createRecordReader(InputSplit split,
    TaskAttemptContext taskContext) throws IOException, InterruptedException {
  setFormat(taskContext.getConfiguration());
  return root.createRecordReader(split, taskContext);
}
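The setFormat(...) call above parses a join expression out of the configuration. A hedged sketch of how that expression is typically composed on the job-setup side, assumed from the standard mapreduce join library rather than taken from these snippets; the input format and paths are illustrative:

// Hedged sketch: composing the join expression consumed by CompositeInputFormat.
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.join.CompositeInputFormat;

public class CompositeJoinSketch {
  public static void configure(Job job, Path left, Path right) {
    // "inner" requests an inner join of the two sorted, identically
    // partitioned inputs; compose(...) builds the expression string.
    job.getConfiguration().set(CompositeInputFormat.JOIN_EXPR,
        CompositeInputFormat.compose("inner", KeyValueTextInputFormat.class,
            left, right));
    job.setInputFormatClass(CompositeInputFormat.class);
  }
}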
/**
 * Close all child RRs.
 */
public void close() throws IOException {
  if (kids != null) {
    for (RecordReader<K, ? extends Writable> rr : kids) {
      rr.close();
    }
  }
  if (jc != null) {
    jc.close();
  }
}
/**
 * Report progress as the minimum of all child RR progress.
 */
public float getProgress() throws IOException, InterruptedException {
  float ret = 1.0f;
  for (RecordReader<K, ? extends Writable> rr : kids) {
    ret = Math.min(ret, rr.getProgress());
  }
  return ret;
}
protected RecordReader<LongWritable, T> createDBRecordReader(DBInputSplit split,
    Configuration conf) throws IOException {
  @SuppressWarnings("unchecked")
  Class<T> inputClass = (Class<T>) (dbConf.getInputClass());
  try {
    // use database product name to determine appropriate record reader.
    if (dbProductName.startsWith("ORACLE")) {
      // use Oracle-specific db reader.
      return new OracleDBRecordReader<T>(split, inputClass, conf,
          createConnection(), getDBConf(), conditions, fieldNames, tableName);
    } else if (dbProductName.startsWith("MYSQL")) {
      // use MySQL-specific db reader.
      return new MySQLDBRecordReader<T>(split, inputClass, conf,
          createConnection(), getDBConf(), conditions, fieldNames, tableName);
    } else {
      // Generic reader.
      return new DBRecordReader<T>(split, inputClass, conf,
          createConnection(), getDBConf(), conditions, fieldNames, tableName);
    }
  } catch (SQLException ex) {
    throw new IOException(ex.getMessage());
  }
}
protected RecordReader<LongWritable, T> createDBRecordReader(DBInputSplit split,
    Configuration conf) throws IOException {
  DBConfiguration dbConf = getDBConf();
  @SuppressWarnings("unchecked")
  Class<T> inputClass = (Class<T>) (dbConf.getInputClass());
  String dbProductName = getDBProductName();

  LOG.debug("Creating db record reader for db product: " + dbProductName);

  try {
    // use database product name to determine appropriate record reader.
    if (dbProductName.startsWith("MYSQL")) {
      // use MySQL-specific db reader.
      return new MySQLDataDrivenDBRecordReader<T>(split, inputClass, conf,
          createConnection(), dbConf, dbConf.getInputConditions(),
          dbConf.getInputFieldNames(), dbConf.getInputTableName());
    } else {
      // Generic reader.
      return new DataDrivenDBRecordReader<T>(split, inputClass, conf,
          createConnection(), dbConf, dbConf.getInputConditions(),
          dbConf.getInputFieldNames(), dbConf.getInputTableName(),
          dbProductName);
    }
  } catch (SQLException ex) {
    throw new IOException(ex.getMessage());
  }
}
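Several of the readers above branch on getDBProductName() / dbProductName, but none of the snippets show where that string comes from. A hedged sketch of one common way to derive it via standard JDBC metadata; this is an assumption, and the real helper in these classes may cache the value or obtain the connection differently:

// Hedged sketch: deriving an upper-cased database product name from JDBC metadata.
import java.io.IOException;
import java.sql.Connection;
import java.sql.SQLException;

public abstract class ProductNameSketch {
  /** Subclasses supply the live JDBC connection, as the readers above do. */
  protected abstract Connection getConnection() throws IOException;

  protected String getDBProductName() throws IOException {
    try {
      // e.g. "MYSQL", "ORACLE", or "MICROSOFT SQL SERVER" after upper-casing,
      // which is what the startsWith(...) checks above rely on.
      return getConnection().getMetaData().getDatabaseProductName().toUpperCase();
    } catch (SQLException ex) {
      throw new IOException(ex);
    }
  }
}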
ChainMapContextImpl(
    TaskInputOutputContext<KEYIN, VALUEIN, KEYOUT, VALUEOUT> base,
    RecordReader<KEYIN, VALUEIN> rr, RecordWriter<KEYOUT, VALUEOUT> rw,
    Configuration conf) {
  this.reader = rr;
  this.output = rw;
  this.base = base;
  this.conf = conf;
}