/** * Run the test * * @throws IOException on error */ public static void runTests() throws IOException { config.setLong("io.bytes.per.checksum", bytesPerChecksum); JobConf job = new JobConf(config, NNBench.class); job.setJobName("NNBench-" + operation); FileInputFormat.setInputPaths(job, new Path(baseDir, CONTROL_DIR_NAME)); job.setInputFormat(SequenceFileInputFormat.class); // Explicitly set number of max map attempts to 1. job.setMaxMapAttempts(1); // Explicitly turn off speculative execution job.setSpeculativeExecution(false); job.setMapperClass(NNBenchMapper.class); job.setReducerClass(NNBenchReducer.class); FileOutputFormat.setOutputPath(job, new Path(baseDir, OUTPUT_DIR_NAME)); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); job.setNumReduceTasks((int) numberOfReduces); JobClient.runJob(job); }
private void runIOTest( Class<? extends Mapper<Text, LongWritable, Text, Text>> mapperClass, Path outputDir) throws IOException { JobConf job = new JobConf(config, TestDFSIO.class); FileInputFormat.setInputPaths(job, getControlDir(config)); job.setInputFormat(SequenceFileInputFormat.class); job.setMapperClass(mapperClass); job.setReducerClass(AccumulatingReducer.class); FileOutputFormat.setOutputPath(job, outputDir); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); job.setNumReduceTasks(1); JobClient.runJob(job); }
/** * Sets up a job conf for the given job using the given config object. Ensures * that the correct input format is set, the mapper and and reducer class and * the input and output keys and value classes along with any other job * configuration. * * @param config * @return JobConf representing the job to be ran * @throws IOException */ private JobConf getJob(ConfigExtractor config) throws IOException { JobConf job = new JobConf(config.getConfig(), SliveTest.class); job.setInputFormat(DummyInputFormat.class); FileOutputFormat.setOutputPath(job, config.getOutputPath()); job.setMapperClass(SliveMapper.class); job.setPartitionerClass(SlivePartitioner.class); job.setReducerClass(SliveReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); job.setOutputFormat(TextOutputFormat.class); TextOutputFormat.setCompressOutput(job, false); job.setNumReduceTasks(config.getReducerAmount()); job.setNumMapTasks(config.getMapAmount()); return job; }
@Override public void close() throws IOException { // Output the result to a file Results in the output dir FileContext fc; try { fc = FileContext.getFileContext(jobConf); } catch (IOException ioe) { System.err.println("Can not initialize the file system: " + ioe.getLocalizedMessage()); return; } FSDataOutputStream o = fc.create(FileOutputFormat.getTaskOutputPath(jobConf, "Results"), EnumSet.of(CreateFlag.CREATE)); PrintStream out = new PrintStream(o); printResults(out); out.close(); o.close(); }
private static void joinAs(String jointype, Class<? extends SimpleCheckerBase> c) throws Exception { final int srcs = 4; Configuration conf = new Configuration(); JobConf job = new JobConf(conf, c); Path base = cluster.getFileSystem().makeQualified(new Path("/"+jointype)); Path[] src = writeSimpleSrc(base, conf, srcs); job.set("mapreduce.join.expr", CompositeInputFormat.compose(jointype, SequenceFileInputFormat.class, src)); job.setInt("testdatamerge.sources", srcs); job.setInputFormat(CompositeInputFormat.class); FileOutputFormat.setOutputPath(job, new Path(base, "out")); job.setMapperClass(c); job.setReducerClass(c); job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(IntWritable.class); JobClient.runJob(job); base.getFileSystem(job).delete(base, true); }
public void testEmptyJoin() throws Exception { JobConf job = new JobConf(); Path base = cluster.getFileSystem().makeQualified(new Path("/empty")); Path[] src = { new Path(base,"i0"), new Path("i1"), new Path("i2") }; job.set("mapreduce.join.expr", CompositeInputFormat.compose("outer", Fake_IF.class, src)); job.setInputFormat(CompositeInputFormat.class); FileOutputFormat.setOutputPath(job, new Path(base, "out")); job.setMapperClass(IdentityMapper.class); job.setReducerClass(IdentityReducer.class); job.setOutputKeyClass(IncomparableKey.class); job.setOutputValueClass(NullWritable.class); JobClient.runJob(job); base.getFileSystem(job).delete(base, true); }
/** * Creates a simple copy job. * * @param indirs List of input directories. * @param outdir Output directory. * @return JobConf initialised for a simple copy job. * @throws Exception If an error occurs creating job configuration. */ static JobConf createCopyJob(List<Path> indirs, Path outdir) throws Exception { Configuration defaults = new Configuration(); JobConf theJob = new JobConf(defaults, TestJobControl.class); theJob.setJobName("DataMoveJob"); FileInputFormat.setInputPaths(theJob, indirs.toArray(new Path[0])); theJob.setMapperClass(DataCopy.class); FileOutputFormat.setOutputPath(theJob, outdir); theJob.setOutputKeyClass(Text.class); theJob.setOutputValueClass(Text.class); theJob.setReducerClass(DataCopy.class); theJob.setNumMapTasks(12); theJob.setNumReduceTasks(4); return theJob; }
public void configure(JobConf conf) { this.conf = conf; tmpOutputDir = FileOutputFormat.getWorkOutputPath(this.conf); masterIndex = new Path(tmpOutputDir, "_masterindex"); index = new Path(tmpOutputDir, "_index"); try { fs = masterIndex.getFileSystem(conf); if (fs.exists(masterIndex)) { fs.delete(masterIndex, false); } if (fs.exists(index)) { fs.delete(index, false); } indexStream = fs.create(index); outStream = fs.create(masterIndex); String version = VERSION + " \n"; outStream.write(version.getBytes(Charsets.UTF_8)); } catch(IOException e) { throw new RuntimeException(e); } }
/** * @param args * @return the JobConf * @throws IOException */ public JobConf createSubmittableJob(String[] args) throws IOException { JobConf c = new JobConf(getConf(), getClass()); c.setJobName(NAME); // Columns are space delimited StringBuilder sb = new StringBuilder(); final int columnoffset = 2; for (int i = columnoffset; i < args.length; i++) { if (i > columnoffset) { sb.append(" "); } sb.append(args[i]); } // Second argument is the table name. TableMapReduceUtil.initTableMapJob(args[1], sb.toString(), RowCounterMapper.class, ImmutableBytesWritable.class, Result.class, c); c.setNumReduceTasks(0); // First arg is the output directory. FileOutputFormat.setOutputPath(c, new Path(args[0])); return c; }
@Override protected void runJob(String jobName, Configuration c, List<Scan> scans) throws IOException, InterruptedException, ClassNotFoundException { JobConf job = new JobConf(TEST_UTIL.getConfiguration()); job.setJobName(jobName); job.setMapperClass(Mapper.class); job.setReducerClass(Reducer.class); TableMapReduceUtil.initMultiTableSnapshotMapperJob(getSnapshotScanMapping(scans), Mapper.class, ImmutableBytesWritable.class, ImmutableBytesWritable.class, job, true, restoreDir); TableMapReduceUtil.addDependencyJars(job); job.setReducerClass(Reducer.class); job.setNumReduceTasks(1); // one to get final "first" and "last" key FileOutputFormat.setOutputPath(job, new Path(job.getJobName())); LOG.info("Started " + job.getJobName()); RunningJob runningJob = JobClient.runJob(job); runningJob.waitForCompletion(); assertTrue(runningJob.isSuccessful()); LOG.info("After map/reduce completion - job " + jobName); }
private static JobConf getOldAPIJobconf(Configuration configuration, String name, String input, String output) throws Exception { final JobConf jobConf = new JobConf(configuration); final FileSystem fs = FileSystem.get(configuration); if (fs.exists(new Path(output))) { fs.delete(new Path(output), true); } fs.close(); jobConf.setJobName(name); jobConf.setOutputKeyClass(Text.class); jobConf.setOutputValueClass(IntWritable.class); jobConf.setMapperClass(WordCountWithOldAPI.TokenizerMapperWithOldAPI.class); jobConf.setCombinerClass(WordCountWithOldAPI.IntSumReducerWithOldAPI.class); jobConf.setReducerClass(WordCountWithOldAPI.IntSumReducerWithOldAPI.class); jobConf.setInputFormat(SequenceFileInputFormat.class); jobConf.setOutputFormat(TextOutputFormat.class); FileInputFormat.setInputPaths(jobConf, new Path(input)); FileOutputFormat.setOutputPath(jobConf, new Path(output)); return jobConf; }
public void configure(JobConf conf) { this.conf = conf; tmpOutputDir = FileOutputFormat.getWorkOutputPath(this.conf); masterIndex = new Path(tmpOutputDir, "_masterindex"); index = new Path(tmpOutputDir, "_index"); replication = conf.getInt(HAR_REPLICATION_LABEL, 3); try { fs = masterIndex.getFileSystem(conf); if (fs.exists(masterIndex)) { fs.delete(masterIndex, false); } if (fs.exists(index)) { fs.delete(index, false); } indexStream = fs.create(index); outStream = fs.create(masterIndex); String version = VERSION + " \n"; outStream.write(version.getBytes(Charsets.UTF_8)); } catch(IOException e) { throw new RuntimeException(e); } }
@Override public RecordWriter<NullWritable, DynamoDBItemWritable> getRecordWriter(FileSystem ignored, JobConf job, String name, Progressable progress) throws IOException { boolean isCompressed = getCompressOutput(job); CompressionCodec codec = null; String extension = ""; DataOutputStream fileOut; if (isCompressed) { Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(job, GzipCodec.class); codec = ReflectionUtils.newInstance(codecClass, job); extension = codec.getDefaultExtension(); } Path file = new Path(FileOutputFormat.getOutputPath(job), name + extension); FileSystem fs = file.getFileSystem(job); if (!isCompressed) { fileOut = fs.create(file, progress); } else { fileOut = new DataOutputStream(codec.createOutputStream(fs.create(file, progress))); } return new ExportRecordWriter(fileOut); }
public RecordWriter<WritableComparable<?>, Writable> getRecordWriter( final FileSystem fs, JobConf job, String name, final Progressable progress) throws IOException { final Path segmentDumpFile = new Path( FileOutputFormat.getOutputPath(job), name); // Get the old copy out of the way if (fs.exists(segmentDumpFile)) fs.delete(segmentDumpFile, true); final PrintStream printStream = new PrintStream( fs.create(segmentDumpFile)); return new RecordWriter<WritableComparable<?>, Writable>() { public synchronized void write(WritableComparable<?> key, Writable value) throws IOException { printStream.println(value); } public synchronized void close(Reporter reporter) throws IOException { printStream.close(); } }; }
public void merge(Path output, Path[] dbs, boolean normalize, boolean filter) throws Exception { SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); long start = System.currentTimeMillis(); LOG.info("LinkDb merge: starting at " + sdf.format(start)); JobConf job = createMergeJob(getConf(), output, normalize, filter); for (int i = 0; i < dbs.length; i++) { FileInputFormat.addInputPath(job, new Path(dbs[i], LinkDb.CURRENT_NAME)); } JobClient.runJob(job); FileSystem fs = FileSystem.get(getConf()); fs.mkdirs(output); fs.rename(FileOutputFormat.getOutputPath(job), new Path(output, LinkDb.CURRENT_NAME)); long end = System.currentTimeMillis(); LOG.info("LinkDb merge: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end)); }
public static JobConf createMergeJob(Configuration config, Path linkDb, boolean normalize, boolean filter) { Path newLinkDb = new Path("linkdb-merge-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); JobConf job = new NutchJob(config); job.setJobName("linkdb merge " + linkDb); job.setInputFormat(SequenceFileInputFormat.class); job.setMapperClass(LinkDbFilter.class); job.setBoolean(LinkDbFilter.URL_NORMALIZING, normalize); job.setBoolean(LinkDbFilter.URL_FILTERING, filter); job.setReducerClass(LinkDbMerger.class); FileOutputFormat.setOutputPath(job, newLinkDb); job.setOutputFormat(MapFileOutputFormat.class); job.setBoolean("mapred.output.compress", true); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Inlinks.class); // https://issues.apache.org/jira/browse/NUTCH-1069 job.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false); return job; }
public static void main(String[] args) throws Exception { JobConf conf = new JobConf(WordCountOldAPI.class); conf.setJobName("old wordcount"); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(IntWritable.class); conf.setMapperClass(Map.class); conf.setCombinerClass(Reduce.class); conf.setReducerClass(Reduce.class); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); FileInputFormat.setInputPaths(conf, new Path(args[0])); FileOutputFormat.setOutputPath(conf, new Path(args[1])); JobClient.runJob(conf); }
public int run(String[] argv) throws IOException { if (argv.length < 2) { System.out.println("ExternalMapReduce <input> <output>"); return -1; } Path outDir = new Path(argv[1]); Path input = new Path(argv[0]); JobConf testConf = new JobConf(getConf(), ExternalMapReduce.class); //try to load a class from libjar try { testConf.getClassByName("testjar.ClassWordCount"); } catch (ClassNotFoundException e) { System.out.println("Could not find class from libjar"); return -1; } testConf.setJobName("external job"); FileInputFormat.setInputPaths(testConf, input); FileOutputFormat.setOutputPath(testConf, outDir); testConf.setMapperClass(MapClass.class); testConf.setReducerClass(Reduce.class); testConf.setNumReduceTasks(1); JobClient.runJob(testConf); return 0; }
public void configure(String keySpec, int expect) throws Exception { Path testdir = new Path(TEST_DIR.getAbsolutePath()); Path inDir = new Path(testdir, "in"); Path outDir = new Path(testdir, "out"); FileSystem fs = getFileSystem(); fs.delete(testdir, true); conf.setInputFormat(TextInputFormat.class); FileInputFormat.setInputPaths(conf, inDir); FileOutputFormat.setOutputPath(conf, outDir); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(LongWritable.class); conf.setNumMapTasks(1); conf.setNumReduceTasks(1); conf.setOutputFormat(TextOutputFormat.class); conf.setOutputKeyComparatorClass(KeyFieldBasedComparator.class); conf.setKeyFieldComparatorOptions(keySpec); conf.setKeyFieldPartitionerOptions("-k1.1,1.1"); conf.set(JobContext.MAP_OUTPUT_KEY_FIELD_SEPERATOR, " "); conf.setMapperClass(InverseMapper.class); conf.setReducerClass(IdentityReducer.class); if (!fs.mkdirs(testdir)) { throw new IOException("Mkdirs failed to create " + testdir.toString()); } if (!fs.mkdirs(inDir)) { throw new IOException("Mkdirs failed to create " + inDir.toString()); } // set up input data in 2 files Path inFile = new Path(inDir, "part0"); FileOutputStream fos = new FileOutputStream(inFile.toString()); fos.write((line1 + "\n").getBytes()); fos.write((line2 + "\n").getBytes()); fos.close(); JobClient jc = new JobClient(conf); RunningJob r_job = jc.submitJob(conf); while (!r_job.isComplete()) { Thread.sleep(1000); } if (!r_job.isSuccessful()) { fail("Oops! The job broke due to an unexpected error"); } Path[] outputFiles = FileUtil.stat2Paths( getFileSystem().listStatus(outDir, new Utils.OutputFileUtils.OutputFilesFilter())); if (outputFiles.length > 0) { InputStream is = getFileSystem().open(outputFiles[0]); BufferedReader reader = new BufferedReader(new InputStreamReader(is)); String line = reader.readLine(); //make sure we get what we expect as the first line, and also //that we have two lines if (expect == 1) { assertTrue(line.startsWith(line1)); } else if (expect == 2) { assertTrue(line.startsWith(line2)); } line = reader.readLine(); if (expect == 1) { assertTrue(line.startsWith(line2)); } else if (expect == 2) { assertTrue(line.startsWith(line1)); } reader.close(); } }
static boolean runJob(JobConf conf, Path inDir, Path outDir, int numMaps, int numReds) throws IOException, InterruptedException { FileSystem fs = FileSystem.get(conf); if (fs.exists(outDir)) { fs.delete(outDir, true); } if (!fs.exists(inDir)) { fs.mkdirs(inDir); } String input = "The quick brown fox\n" + "has many silly\n" + "red fox sox\n"; for (int i = 0; i < numMaps; ++i) { DataOutputStream file = fs.create(new Path(inDir, "part-" + i)); file.writeBytes(input); file.close(); } DistributedCache.addFileToClassPath(TestMRJobs.APP_JAR, conf, fs); conf.setOutputCommitter(CustomOutputCommitter.class); conf.setInputFormat(TextInputFormat.class); conf.setOutputKeyClass(LongWritable.class); conf.setOutputValueClass(Text.class); FileInputFormat.setInputPaths(conf, inDir); FileOutputFormat.setOutputPath(conf, outDir); conf.setNumMapTasks(numMaps); conf.setNumReduceTasks(numReds); JobClient jobClient = new JobClient(conf); RunningJob job = jobClient.submitJob(conf); return jobClient.monitorAndPrintJob(conf, job); }
@Test public void testCombinerShouldUpdateTheReporter() throws Exception { JobConf conf = new JobConf(mrCluster.getConfig()); int numMaps = 5; int numReds = 2; Path in = new Path(mrCluster.getTestWorkDir().getAbsolutePath(), "testCombinerShouldUpdateTheReporter-in"); Path out = new Path(mrCluster.getTestWorkDir().getAbsolutePath(), "testCombinerShouldUpdateTheReporter-out"); createInputOutPutFolder(in, out, numMaps); conf.setJobName("test-job-with-combiner"); conf.setMapperClass(IdentityMapper.class); conf.setCombinerClass(MyCombinerToCheckReporter.class); //conf.setJarByClass(MyCombinerToCheckReporter.class); conf.setReducerClass(IdentityReducer.class); DistributedCache.addFileToClassPath(TestMRJobs.APP_JAR, conf); conf.setOutputCommitter(CustomOutputCommitter.class); conf.setInputFormat(TextInputFormat.class); conf.setOutputKeyClass(LongWritable.class); conf.setOutputValueClass(Text.class); FileInputFormat.setInputPaths(conf, in); FileOutputFormat.setOutputPath(conf, out); conf.setNumMapTasks(numMaps); conf.setNumReduceTasks(numReds); runJob(conf); }
public void configure(JobConf conf) { this.conf = conf; // this is tightly tied to map reduce // since it does not expose an api // to get the partition partId = conf.getInt(MRJobConfig.TASK_PARTITION, -1); // create a file name using the partition // we need to write to this directory tmpOutputDir = FileOutputFormat.getWorkOutputPath(conf); blockSize = conf.getLong(HAR_BLOCKSIZE_LABEL, blockSize); // get the output path and write to the tmp // directory partname = "part-" + partId; tmpOutput = new Path(tmpOutputDir, partname); rootPath = (conf.get(SRC_PARENT_LABEL, null) == null) ? null : new Path(conf.get(SRC_PARENT_LABEL)); if (rootPath == null) { throw new RuntimeException("Unable to read parent " + "path for har from config"); } try { destFs = tmpOutput.getFileSystem(conf); //this was a stale copy if (destFs.exists(tmpOutput)) { destFs.delete(tmpOutput, false); } partStream = destFs.create(tmpOutput, false, conf.getInt("io.file.buffer.size", 4096), destFs.getDefaultReplication(tmpOutput), blockSize); } catch(IOException ie) { throw new RuntimeException("Unable to open output file " + tmpOutput, ie); } buffer = new byte[buf_size]; }
public void configure(JobConf conf) { this.conf = conf; replication = conf.getInt(HAR_REPLICATION_LABEL, 3); // this is tightly tied to map reduce // since it does not expose an api // to get the partition partId = conf.getInt(MRJobConfig.TASK_PARTITION, -1); // create a file name using the partition // we need to write to this directory tmpOutputDir = FileOutputFormat.getWorkOutputPath(conf); blockSize = conf.getLong(HAR_BLOCKSIZE_LABEL, blockSize); // get the output path and write to the tmp // directory partname = "part-" + partId; tmpOutput = new Path(tmpOutputDir, partname); rootPath = (conf.get(SRC_PARENT_LABEL, null) == null) ? null : new Path(conf.get(SRC_PARENT_LABEL)); if (rootPath == null) { throw new RuntimeException("Unable to read parent " + "path for har from config"); } try { destFs = tmpOutput.getFileSystem(conf); //this was a stale copy if (destFs.exists(tmpOutput)) { destFs.delete(tmpOutput, false); } partStream = destFs.create(tmpOutput, false, conf.getInt("io.file.buffer.size", 4096), destFs.getDefaultReplication(tmpOutput), blockSize); } catch(IOException ie) { throw new RuntimeException("Unable to open output file " + tmpOutput, ie); } buffer = new byte[buf_size]; }
public void checkOutputSpecs(FileSystem fs, JobConf job) throws IOException { Path out = FileOutputFormat.getOutputPath(job); if ((out == null) && (job.getNumReduceTasks() != 0)) { throw new InvalidJobConfException( "Output directory not set in JobConf."); } if (fs == null) { fs = out.getFileSystem(job); } if (fs.exists(new Path(out, CrawlDatum.PARSE_DIR_NAME))) throw new IOException("Segment already parsed!"); }
private void createBayesData() throws IOException, URISyntaxException { log.info("creating bayes text data ... "); JobConf job = new JobConf(); Path fout = options.getResultPath(); Utils.checkHdfsPath(fout); String jobname = "Create bayes data"; job.setJobName(jobname); Utils.shareDict(options, job); setBayesOptions(job); FileInputFormat.setInputPaths(job, dummy.getPath()); job.setInputFormat(NLineInputFormat.class); job.setJarByClass(CreateBayesPages.class); job.setMapperClass(CreateBayesPages.class); job.setNumReduceTasks(0); FileOutputFormat.setOutputPath(job, fout); job.setOutputFormat(SequenceFileOutputFormat.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Text.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); log.info("Running Job: " +jobname); log.info("Pages file " + dummy.getPath() + " as input"); log.info("Rankings file " + fout + " as output"); JobClient.runJob(job); log.info("Finished Running Job: " + jobname); }