void testInputFormat(Class<? extends InputFormat> clazz) throws IOException {
  final JobConf job = MapreduceTestingShim.getJobConf(mrCluster);
  job.setInputFormat(clazz);
  job.setOutputFormat(NullOutputFormat.class);
  job.setMapperClass(ExampleVerifier.class);
  job.setNumReduceTasks(0);

  LOG.debug("submitting job.");
  final RunningJob run = JobClient.runJob(job);
  assertTrue("job failed!", run.isSuccessful());
  assertEquals("Saw the wrong number of instances of the filtered-for row.", 2, run.getCounters()
      .findCounter(TestTableInputFormat.class.getName() + ":row", "aaa").getCounter());
  assertEquals("Saw instances of the filtered-out row.", 0, run.getCounters()
      .findCounter(TestTableInputFormat.class.getName() + ":row", "bbb").getCounter());
  assertEquals("Saw the wrong number of instances of columnA.", 1, run.getCounters()
      .findCounter(TestTableInputFormat.class.getName() + ":family", "columnA").getCounter());
  assertEquals("Saw the wrong number of instances of columnB.", 1, run.getCounters()
      .findCounter(TestTableInputFormat.class.getName() + ":family", "columnB").getCounter());
  assertEquals("Saw the wrong count of values for the filtered-for row.", 2, run.getCounters()
      .findCounter(TestTableInputFormat.class.getName() + ":value", "value aaa").getCounter());
  assertEquals("Saw the wrong count of values for the filtered-out row.", 0, run.getCounters()
      .findCounter(TestTableInputFormat.class.getName() + ":value", "value bbb").getCounter());
}
public JobConf setupJobConf(int numMapper, int numReducer, long mapSleepTime,
    int mapSleepCount, long reduceSleepTime, int reduceSleepCount) {
  JobConf job = new JobConf(getConf(), SleepJob.class);
  job.setNumMapTasks(numMapper);
  job.setNumReduceTasks(numReducer);
  job.setMapperClass(SleepJob.class);
  job.setMapOutputKeyClass(IntWritable.class);
  job.setMapOutputValueClass(NullWritable.class);
  job.setReducerClass(SleepJob.class);
  job.setOutputFormat(NullOutputFormat.class);
  job.setInputFormat(SleepInputFormat.class);
  job.setPartitionerClass(SleepJob.class);
  job.setSpeculativeExecution(false);
  job.setJobName("Sleep job");
  FileInputFormat.addInputPath(job, new Path("ignored"));
  job.setLong("sleep.job.map.sleep.time", mapSleepTime);
  job.setLong("sleep.job.reduce.sleep.time", reduceSleepTime);
  job.setInt("sleep.job.map.sleep.count", mapSleepCount);
  job.setInt("sleep.job.reduce.sleep.count", reduceSleepCount);
  return job;
}
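// Hedged usage sketch, not from the original source: submit the sleep job
// built by setupJobConf above. The argument values and the run(...) wrapper
// are illustrative assumptions; JobClient.runJob is the standard blocking
// submission call in the old mapred API.
public int run(String[] args) throws IOException {
  JobConf job = setupJobConf(4, 2, 100L, 1, 100L, 1);
  RunningJob running = JobClient.runJob(job);
  return running.isSuccessful() ? 0 : 1;
}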
public void configure(JobConf job) {
  // Set the mapper and reducers
  job.setMapperClass(ReadDataJob.TestMapper.class);

  // Make sure this jar is included
  job.setJarByClass(ReadDataJob.TestMapper.class);

  // Specify the input and output data formats
  job.setInputFormat(TextInputFormat.class);
  job.setOutputFormat(NullOutputFormat.class);

  // Turn off speculative execution
  job.setMapSpeculativeExecution(false);
  job.setReduceSpeculativeExecution(false);

  // Add the job input path
  FileInputFormat.addInputPath(job, new Path(this.input_path));
}
public void configure(JobConf job) {
  // Set the mapper and reducers
  job.setMapperClass(TestMapper.class);
  // job.setReducerClass(TestReducer.class);

  // Set the output types of the mapper and reducer
  // job.setMapOutputKeyClass(IntWritable.class);
  // job.setMapOutputValueClass(NullWritable.class);
  // job.setOutputKeyClass(NullWritable.class);
  // job.setOutputValueClass(NullWritable.class);

  // Make sure this jar is included
  job.setJarByClass(TestMapper.class);

  // Specify the input and output data formats
  job.setInputFormat(TextInputFormat.class);
  job.setOutputFormat(NullOutputFormat.class);

  // Turn off speculative execution
  job.setMapSpeculativeExecution(false);
  job.setReduceSpeculativeExecution(false);

  // Add the job input path
  FileInputFormat.addInputPath(job, new Path(this.input_filename));
}
private static void runJvmReuseTest(JobConf job, boolean reuse) throws IOException {
  // Set up a map-only job that reads the input and only sets the counters
  // based on how many times the JVM was reused.
  job.setInt("mapred.job.reuse.jvm.num.tasks", reuse ? -1 : 1);
  FileInputFormat.setInputPaths(job, SORT_INPUT_PATH);
  job.setInputFormat(SequenceFileInputFormat.class);
  job.setOutputFormat(NullOutputFormat.class);
  job.setMapperClass(ReuseDetector.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setNumMapTasks(24);
  job.setNumReduceTasks(0);

  RunningJob result = JobClient.runJob(job);
  long uses = result.getCounters().findCounter("jvm", "use").getValue();
  int maps = job.getNumMapTasks();
  if (reuse) {
    assertTrue("maps = " + maps + ", uses = " + uses, maps < uses);
  } else {
    assertEquals("uses should be number of maps", job.getNumMapTasks(), uses);
  }
}
private static void runTest(String name, int keylen, int vallen, int records,
    int ioSortMB, float recPer, float spillPer, boolean pedantic) throws Exception {
  JobConf conf = new JobConf(new Configuration(), SpillMapper.class);
  conf.setInt("io.sort.mb", ioSortMB);
  conf.set("io.sort.record.percent", Float.toString(recPer));
  conf.set("io.sort.spill.percent", Float.toString(spillPer));
  conf.setInt("test.keywritable.length", keylen);
  conf.setInt("test.valwritable.length", vallen);
  conf.setInt("test.spillmap.records", records);
  conf.setBoolean("test.pedantic.verification", pedantic);
  conf.setNumMapTasks(1);
  conf.setNumReduceTasks(1);
  conf.setInputFormat(FakeIF.class);
  conf.setOutputFormat(NullOutputFormat.class);
  conf.setMapperClass(SpillMapper.class);
  conf.setReducerClass(SpillReducer.class);
  conf.setMapOutputKeyClass(KeyWritable.class);
  conf.setMapOutputValueClass(ValWritable.class);
  LOG.info("Running " + name);
  JobClient.runJob(conf);
}
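// Hedged example invocation of runTest above. All values are illustrative
// assumptions, not from the source: a 1 MB sort buffer with small records
// forces spills, and pedantic verification is enabled.
runTest("spill-smoke-test", 10, 10, 10000, 1, 0.05f, 0.8f, true);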
public void dedup(String solrUrl, boolean noCommit) throws IOException {
  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  LOG.info("SolrDeleteDuplicates: starting at " + sdf.format(start));
  LOG.info("SolrDeleteDuplicates: Solr url: " + solrUrl);

  JobConf job = new NutchJob(getConf());
  job.set(SolrConstants.SERVER_URL, solrUrl);
  job.setBoolean("noCommit", noCommit);
  job.setInputFormat(SolrInputFormat.class);
  job.setOutputFormat(NullOutputFormat.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(SolrRecord.class);
  job.setMapperClass(IdentityMapper.class);
  job.setReducerClass(SolrDeleteDuplicates.class);

  JobClient.runJob(job);

  long end = System.currentTimeMillis();
  LOG.info("SolrDeleteDuplicates: finished at " + sdf.format(end)
      + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
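// Hedged usage sketch: run the dedup job against a local Solr instance.
// The URL and the instantiation details are assumptions; in Nutch this
// method lives on SolrDeleteDuplicates, the same class used as the reducer.
SolrDeleteDuplicates dedup = new SolrDeleteDuplicates();
dedup.setConf(NutchConfiguration.create());
dedup.dedup("http://localhost:8983/solr", false);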
public void delete(String crawldb, String solrUrl, boolean noCommit) throws IOException {
  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  LOG.info("SolrClean: starting at " + sdf.format(start));

  JobConf job = new NutchJob(getConf());
  FileInputFormat.addInputPath(job, new Path(crawldb, CrawlDb.CURRENT_NAME));
  job.setBoolean("noCommit", noCommit);
  job.set(SolrConstants.SERVER_URL, solrUrl);
  job.setInputFormat(SequenceFileInputFormat.class);
  job.setOutputFormat(NullOutputFormat.class);
  job.setMapOutputKeyClass(ByteWritable.class);
  job.setMapOutputValueClass(Text.class);
  job.setMapperClass(DBFilter.class);
  job.setReducerClass(SolrDeleter.class);

  JobClient.runJob(job);

  long end = System.currentTimeMillis();
  LOG.info("SolrClean: finished at " + sdf.format(end)
      + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
void testInputFormat(Class<? extends InputFormat> clazz) throws IOException {
  Configuration conf = UTIL.getConfiguration();
  final JobConf job = new JobConf(conf);
  job.setInputFormat(clazz);
  job.setOutputFormat(NullOutputFormat.class);
  job.setMapperClass(ExampleVerifier.class);
  job.setNumReduceTasks(0);

  LOG.debug("submitting job.");
  final RunningJob run = JobClient.runJob(job);
  assertTrue("job failed!", run.isSuccessful());
  assertEquals("Saw the wrong number of instances of the filtered-for row.", 2, run.getCounters()
      .findCounter(TestTableInputFormat.class.getName() + ":row", "aaa").getCounter());
  assertEquals("Saw instances of the filtered-out row.", 0, run.getCounters()
      .findCounter(TestTableInputFormat.class.getName() + ":row", "bbb").getCounter());
  assertEquals("Saw the wrong number of instances of columnA.", 1, run.getCounters()
      .findCounter(TestTableInputFormat.class.getName() + ":family", "columnA").getCounter());
  assertEquals("Saw the wrong number of instances of columnB.", 1, run.getCounters()
      .findCounter(TestTableInputFormat.class.getName() + ":family", "columnB").getCounter());
  assertEquals("Saw the wrong count of values for the filtered-for row.", 2, run.getCounters()
      .findCounter(TestTableInputFormat.class.getName() + ":value", "value aaa").getCounter());
  assertEquals("Saw the wrong count of values for the filtered-out row.", 0, run.getCounters()
      .findCounter(TestTableInputFormat.class.getName() + ":value", "value bbb").getCounter());
}
private void configureJob(JobConf conf) {
  conf.setJobName("History");
  conf.setInputFormat(TextInputFormat.class);
  conf.setMapOutputKeyClass(LongWritable.class);
  conf.setMapOutputValueClass(Text.class);
  conf.setOutputFormat(NullOutputFormat.class);
  conf.setOutputKeyClass(LongWritable.class);
  conf.setOutputValueClass(Text.class);
  conf.setMapperClass(org.apache.hadoop.mapred.lib.IdentityMapper.class);
  conf.setReducerClass(org.apache.hadoop.mapred.lib.IdentityReducer.class);
  FileInputFormat.setInputPaths(conf, "/tmp/input");
}
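// Minimal sketch of driving configureJob above (assumes /tmp/input exists
// and a cluster is reachable; not from the original source). Because the
// job uses NullOutputFormat it writes no output files, so it is run only
// for its side effects, such as generating job history.
JobConf conf = new JobConf(new Configuration());
configureJob(conf);
JobClient.runJob(conf);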
private static void runJvmReuseTest(JobConf job, boolean reuse) throws IOException {
  // Set up a map-only job that reads the input and only sets the counters
  // based on how many times the JVM was reused.
  job.setInt(JobContext.JVM_NUMTASKS_TORUN, reuse ? -1 : 1);
  FileInputFormat.setInputPaths(job, SORT_INPUT_PATH);
  job.setInputFormat(SequenceFileInputFormat.class);
  job.setOutputFormat(NullOutputFormat.class);
  job.setMapperClass(ReuseDetector.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setNumMapTasks(24);
  job.setNumReduceTasks(0);

  RunningJob result = JobClient.runJob(job);
  long uses = result.getCounters().findCounter("jvm", "use").getValue();
  int maps = job.getNumMapTasks();
  if (reuse) {
    assertTrue("maps = " + maps + ", uses = " + uses, maps < uses);
  } else {
    assertEquals("uses should be number of maps", job.getNumMapTasks(), uses);
  }
}
public JobConf setupJobConf() {
  JobConf job = new JobConf(getConf(), MyDummyJob.class);
  job.setNumMapTasks(1);
  job.setNumReduceTasks(1);
  job.setMapperClass(MyDummyJob.class);
  job.setMapOutputKeyClass(IntWritable.class);
  job.setMapOutputValueClass(NullWritable.class);
  job.setReducerClass(MyDummyJob.class);
  job.setOutputFormat(NullOutputFormat.class);
  job.setInputFormat(EmptyInputFormat.class);
  job.setPartitionerClass(MyDummyJob.class);
  job.setSpeculativeExecution(false);
  job.setJobName("Sleep job");
  populateTokens(job);
  return job;
}
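// Hedged usage sketch, assumed to run inside the same test class so that
// getConf() and populateTokens(...) resolve: build and submit the dummy job
// configured above. JobClient.runJob blocks until the job completes.
JobConf dummy = setupJobConf();
RunningJob rj = JobClient.runJob(dummy);
assertTrue("dummy job failed", rj.isSuccessful());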