public static void run(Configuration conf, Path inputPath, Path output, double params)
    throws IOException, ClassNotFoundException, InterruptedException {
  String jobName = "calculating parameter";
  conf.set("params", String.valueOf(params));

  Job job = new Job(conf, jobName);
  job.setMapOutputKeyClass(IntWritable.class);
  job.setMapOutputValueClass(indexToCountWritable.class);
  job.setOutputKeyClass(twoDimensionIndexWritable.class);
  job.setOutputValueClass(Text.class);
  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setMapperClass(CalParamsMapper.class);
  job.setReducerClass(CalParamsReducer.class);

  FileInputFormat.addInputPath(job, inputPath);
  FileOutputFormat.setOutputPath(job, output);
  job.setJarByClass(LDADriver.class);

  if (!job.waitForCompletion(true)) {
    throw new InterruptedException("calculating parameter failed");
  }
}
public static void runJob(Path input, Path output, String vectorClassName, Configuration config)
    throws IOException, InterruptedException, ClassNotFoundException {
  Configuration conf = config;
  conf.set("vector.implementation.class.name", vectorClassName);

  Job job = new Job(conf, "Input Driver running over input: " + input);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(VectorWritable.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setMapperClass(InputMapper.class);
  job.setNumReduceTasks(0);
  job.setJarByClass(InputDriver.class);

  FileInputFormat.addInputPath(job, input);
  FileOutputFormat.setOutputPath(job, output);

  job.waitForCompletion(true);
}
@Override
protected Class<? extends OutputFormat> getOutputFormatClass() throws ClassNotFoundException {
  if (isHCatJob) {
    LOG.debug("Returning HCatOutputFormat for output format");
    return SqoopHCatUtilities.getOutputFormatClass();
  }
  if (options.getFileLayout() == SqoopOptions.FileLayout.TextFile) {
    return RawKeyTextOutputFormat.class;
  } else if (options.getFileLayout() == SqoopOptions.FileLayout.SequenceFile) {
    return SequenceFileOutputFormat.class;
  } else if (options.getFileLayout() == SqoopOptions.FileLayout.AvroDataFile) {
    return AvroOutputFormat.class;
  } else if (options.getFileLayout() == SqoopOptions.FileLayout.ParquetFile) {
    return DatasetKeyOutputFormat.class;
  }
  return null;
}
private static void joinAs(String jointype,
    Class<? extends SimpleCheckerMapBase<?>> map,
    Class<? extends SimpleCheckerReduceBase> reduce) throws Exception {
  final int srcs = 4;
  Configuration conf = new Configuration();
  Path base = cluster.getFileSystem().makeQualified(new Path("/" + jointype));
  Path[] src = writeSimpleSrc(base, conf, srcs);
  conf.set(CompositeInputFormat.JOIN_EXPR,
      CompositeInputFormat.compose(jointype, SequenceFileInputFormat.class, src));
  conf.setInt("testdatamerge.sources", srcs);

  Job job = Job.getInstance(conf);
  job.setInputFormatClass(CompositeInputFormat.class);
  FileOutputFormat.setOutputPath(job, new Path(base, "out"));
  job.setMapperClass(map);
  job.setReducerClass(reduce);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(IntWritable.class);
  job.waitForCompletion(true);
  assertTrue("Job failed", job.isSuccessful());

  if ("outer".equals(jointype)) {
    checkOuterConsistency(job, src);
  }
  base.getFileSystem(conf).delete(base, true);
}
public Job createJob(Configuration conf) throws IOException {
  long numBytesToWritePerMap = conf.getLong(BYTES_PER_MAP, 10 * 1024);
  long totalBytesToWrite = conf.getLong(TOTAL_BYTES, numBytesToWritePerMap);
  int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap);
  if (numMaps == 0 && totalBytesToWrite > 0) {
    numMaps = 1;
    conf.setLong(BYTES_PER_MAP, totalBytesToWrite);
  }
  conf.setInt(MRJobConfig.NUM_MAPS, numMaps);

  Job job = Job.getInstance(conf);
  job.setJarByClass(RandomTextWriterJob.class);
  job.setJobName("random-text-writer");
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setInputFormatClass(RandomInputFormat.class);
  job.setMapperClass(RandomTextMapper.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  //FileOutputFormat.setOutputPath(job, new Path("random-output"));
  job.setNumReduceTasks(0);
  return job;
}
public static void runJob(Configuration conf, Path inputPath, Path output)
    throws IOException, ClassNotFoundException, InterruptedException {
  Job job = new Job(conf, "Input Driver running input: " + inputPath);
  log.info("start running InputDriver");

  job.setMapOutputKeyClass(LongWritable.class);
  job.setMapOutputValueClass(indexToWordWritable.class);
  job.setOutputKeyClass(twoDimensionIndexWritable.class);
  job.setOutputValueClass(Text.class);
  job.setMapperClass(InputMapper.class);
  job.setReducerClass(InputReducer.class);
  job.setNumReduceTasks(1);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setJarByClass(InputDriver.class);

  FileInputFormat.addInputPath(job, inputPath);
  FileOutputFormat.setOutputPath(job, output);

  boolean succeeded = job.waitForCompletion(true);
  if (!succeeded) {
    throw new IllegalStateException("Job failed!");
  }
}
@Override
public void call(JavaPairRDD<K,M> rdd, Time time) throws IOException {
  if (rdd.isEmpty()) {
    log.info("RDD was empty, not saving to HDFS");
  } else {
    String file = prefix + "-" + time.milliseconds() + "." + suffix;
    Path path = new Path(file);
    FileSystem fs = FileSystem.get(path.toUri(), hadoopConf);
    if (fs.exists(path)) {
      log.warn("Saved data already existed, possibly from a failed job. Deleting {}", path);
      fs.delete(path, true);
    }
    log.info("Saving RDD to HDFS at {}", file);
    rdd.mapToPair(
        new ValueToWritableFunction<>(keyClass, messageClass, keyWritableClass, messageWritableClass)
    ).saveAsNewAPIHadoopFile(
        file,
        keyWritableClass,
        messageWritableClass,
        SequenceFileOutputFormat.class,
        hadoopConf);
  }
}
/**
 * Sets up the actual job.
 *
 * @param conf The current configuration.
 * @param args The command line parameters.
 * @return The newly created job.
 * @throws IOException When setting up the job fails.
 */
public static Job createSubmittableJob(Configuration conf, String[] args) throws IOException {
  String tableName = args[0];
  Path outputDir = new Path(args[1]);

  Job job = new Job(conf, NAME + "_" + tableName);
  job.setJobName(NAME + "_" + tableName);
  job.setJarByClass(Export.class);

  // Set optional scan parameters
  Scan s = getConfiguredScanForJob(conf, args);
  IdentityTableMapper.initJob(tableName, s, IdentityTableMapper.class, job);

  // No reducers. Just write straight to output files.
  job.setNumReduceTasks(0);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setOutputKeyClass(ImmutableBytesWritable.class);
  job.setOutputValueClass(Result.class);
  FileOutputFormat.setOutputPath(job, outputDir); // job conf doesn't contain the conf so doesn't have a default fs.
  return job;
}
public int runRandomInputGenerator(int numMappers, long numNodes, Path tmpOutput,
    Integer width, Integer wrapMuplitplier) throws Exception {
  LOG.info("Running RandomInputGenerator with numMappers=" + numMappers
      + ", numNodes=" + numNodes);

  Job job = Job.getInstance(getConf());
  job.setJobName("Random Input Generator");
  job.setNumReduceTasks(0);
  job.setJarByClass(getClass());

  job.setInputFormatClass(GeneratorInputFormat.class);
  job.setOutputKeyClass(BytesWritable.class);
  job.setOutputValueClass(NullWritable.class);

  setJobConf(job, numMappers, numNodes, width, wrapMuplitplier);

  job.setMapperClass(Mapper.class); //identity mapper

  FileOutputFormat.setOutputPath(job, tmpOutput);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);

  boolean success = jobCompletion(job);
  return success ? 0 : 1;
}
public static void Run(String output, String outputFormat, int reducerNum, Configuration conf)
    throws IOException, ClassNotFoundException, InterruptedException {
  Job job = Job.getInstance(conf);
  // job.setJobName(Es2Json.class.getName());
  job.setJarByClass(Es2Json.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(Text.class);
  job.setMapperClass(MapTask.class);
  job.setReducerClass(ReduceTask.class);
  job.setInputFormatClass(EsInputFormat.class);

  if (outputFormat.equals("sequencefile")) {
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
  }

  job.setNumReduceTasks(reducerNum);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  FileOutputFormat.setOutputPath(job, new Path(output));
  job.setSpeculativeExecution(false);

  job.waitForCompletion(true);
}
@Override
public <K, V> Iterator<KeyValue<K, V>> writeMemoryRDD(final Configuration configuration,
    final String memoryKey, JavaPairRDD<K, V> memoryRDD) {
  final org.apache.hadoop.conf.Configuration hadoopConfiguration = ConfUtil.makeHadoopConfiguration(configuration);
  final String outputLocation = hadoopConfiguration.get(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION);
  if (null != outputLocation) {
    // map back to a Hadoop stream for output
    memoryRDD.mapToPair(keyValue ->
            new Tuple2<>(new ObjectWritable<>(keyValue._1()), new ObjectWritable<>(keyValue._2())))
        .saveAsNewAPIHadoopFile(Constants.getMemoryLocation(outputLocation, memoryKey),
            ObjectWritable.class, ObjectWritable.class, SequenceFileOutputFormat.class, hadoopConfiguration);
    try {
      return (Iterator) new ObjectWritableIterator(hadoopConfiguration,
          new Path(Constants.getMemoryLocation(outputLocation, memoryKey)));
    } catch (final IOException e) {
      throw new IllegalStateException(e.getMessage(), e);
    }
  }
  return Collections.emptyIterator();
}
private boolean bigItemCount(String output) throws IOException, ClassNotFoundException, InterruptedException {
  Job job = Job.getInstance(this.getConf(), "Counting items from " + this.input);
  job.setJarByClass(TopPIoverHadoop.class);

  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(IntWritable.class);

  FileInputFormat.addInputPath(job, new Path(this.input));
  FileOutputFormat.setOutputPath(job, new Path(output));

  job.setMapperClass(ItemBigCountingMapper.class);
  job.setReducerClass(ItemBigCountingReducer.class);

  boolean success = job.waitForCompletion(true);

  if (success) {
    Counter rebasingMaxID = job.getCounters().findCounter(TaskCounter.REDUCE_OUTPUT_RECORDS);
    this.getConf().setInt(KEY_REBASING_MAX_ID, (int) rebasingMaxID.getValue());
  }

  return success;
}
private boolean genBigItemMap(String input, String output) throws IOException, ClassNotFoundException, InterruptedException {
  Job job = Job.getInstance(this.getConf(), "Computing items remapping for " + this.input);
  job.setJarByClass(TopPIoverHadoop.class);

  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(IntWritable.class);

  FileInputFormat.addInputPath(job, new Path(input));
  FileOutputFormat.setOutputPath(job, new Path(output));

  job.setMapperClass(InverseMapper.class);
  job.setReducerClass(ItemBigRebasingReducer.class);
  job.setNumReduceTasks(1);

  return job.waitForCompletion(true);
}
private boolean filterInput(String output, String rebasingMapPath)
    throws IOException, ClassNotFoundException, InterruptedException {
  Job job = Job.getInstance(this.getConf(), "Computing items remapping for " + this.input);
  job.setJarByClass(TopPIoverHadoop.class);

  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setOutputKeyClass(NullWritable.class);
  job.setOutputValueClass(ConcatenatedTransactionsWritable.class);

  DistCache.copyToCache(job, rebasingMapPath);
  FileInputFormat.addInputPath(job, new Path(input));
  FileOutputFormat.setOutputPath(job, new Path(output));

  job.setMapperClass(FilteringMapper.class);
  job.setNumReduceTasks(0);

  return job.waitForCompletion(true);
}
private static JavaRDD<Tuple3<String, String, Float>> handleRelations(
    JavaRDD<DocumentToSoftwareUrlWithMeta> documentToSoftwareUrl, String actionSetId,
    Configuration jobConfig, String outputAvroPath) {
  JavaRDD<Tuple3<String, String, Float>> distinctRelationTriples = documentToSoftwareUrl
      .map(e -> new Tuple3<>(e.getDocumentId().toString(), generateSoftwareEntityId(pickUrl(e)),
          e.getConfidenceLevel()))
      .distinct();

  JavaPairRDD<String, Tuple3<String, String, Float>> relationTriplesByIdPair = distinctRelationTriples
      .mapToPair(e -> new Tuple2<String, Tuple3<String, String, Float>>(
          joinDocumentAndSoftwareIds(e._1(), e._2()), e));

  JavaRDD<Tuple3<String, String, Float>> dedupedRelationTriples = relationTriplesByIdPair
      .reduceByKey((x, y) -> pickBestConfidence(x, y)).values();

  // to be used by both entity exporter and reporter consumers
  dedupedRelationTriples.cache();

  JavaPairRDD<Text, Text> relationResult = dedupedRelationTriples.flatMapToPair(
      x -> (Iterable<Tuple2<Text, Text>>) buildRelationActions(x._1(), x._2(), x._3(), actionSetId).stream()
          .map(action -> new Tuple2<Text, Text>(new Text(action.getRowKey()), new Text(action.toString())))::iterator);

  relationResult.coalesce(numberOfOutputFiles).saveAsNewAPIHadoopFile(outputAvroPath,
      Text.class, Text.class, SequenceFileOutputFormat.class, jobConfig);

  return dedupedRelationTriples;
}
public int runRandomInputGenerator(int numMappers, long numNodes, Path tmpOutput,
    Integer width, Integer wrapMuplitplier) throws Exception {
  LOG.info("Running RandomInputGenerator with numMappers=" + numMappers
      + ", numNodes=" + numNodes);

  Job job = new Job(getConf());
  job.setJobName("Random Input Generator");
  job.setNumReduceTasks(0);
  job.setJarByClass(getClass());

  job.setInputFormatClass(GeneratorInputFormat.class);
  job.setOutputKeyClass(BytesWritable.class);
  job.setOutputValueClass(NullWritable.class);

  setJobConf(job, numMappers, numNodes, width, wrapMuplitplier);

  job.setMapperClass(Mapper.class); //identity mapper

  FileOutputFormat.setOutputPath(job, tmpOutput);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);

  boolean success = job.waitForCompletion(true);
  return success ? 0 : 1;
}
public void run(Configuration conf, Path matrixInputPath, String meanSpanFileName,
    Path matrixOutputPath, double sampleRate)
    throws IOException, InterruptedException, ClassNotFoundException {
  conf.set(MEANSPANOPTION, meanSpanFileName);
  conf.setFloat(SAMPLERATE, (float) sampleRate);

  Job job = new Job(conf);
  job.setJobName("Normalize");
  job.setJarByClass(NormalizeJob.class);

  FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf);
  matrixInputPath = fs.makeQualified(matrixInputPath);
  matrixOutputPath = fs.makeQualified(matrixOutputPath);

  FileInputFormat.addInputPath(job, matrixInputPath);
  job.setInputFormatClass(SequenceFileInputFormat.class);
  FileOutputFormat.setOutputPath(job, matrixOutputPath);
  job.setMapperClass(NormalizeMapper.class);
  job.setNumReduceTasks(0);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(VectorWritable.class);

  job.submit();
  job.waitForCompletion(true);
}
public Job createJob(Configuration conf) throws IOException {
  long numBytesToWritePerMap = conf.getLong(BYTES_PER_MAP, 10 * 1024);
  long totalBytesToWrite = conf.getLong(TOTAL_BYTES, numBytesToWritePerMap);
  int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap);
  if (numMaps == 0 && totalBytesToWrite > 0) {
    numMaps = 1;
    conf.setLong(BYTES_PER_MAP, totalBytesToWrite);
  }
  conf.setInt(MRJobConfig.NUM_MAPS, numMaps);

  Job job = new Job(conf);
  job.setJarByClass(RandomTextWriterJob.class);
  job.setJobName("random-text-writer");
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setInputFormatClass(RandomInputFormat.class);
  job.setMapperClass(RandomTextMapper.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  //FileOutputFormat.setOutputPath(job, new Path("random-output"));
  job.setNumReduceTasks(0);
  return job;
}
private void setupMapper(CubeSegment cubeSeg) throws IOException {
  // set the segment's offset info to job conf
  Map<Integer, Long> offsetStart = cubeSeg.getSourcePartitionOffsetStart();
  Map<Integer, Long> offsetEnd = cubeSeg.getSourcePartitionOffsetEnd();

  Integer minPartition = Collections.min(offsetStart.keySet());
  Integer maxPartition = Collections.max(offsetStart.keySet());
  job.getConfiguration().set(CONFIG_KAFKA_PARITION_MIN, minPartition.toString());
  job.getConfiguration().set(CONFIG_KAFKA_PARITION_MAX, maxPartition.toString());

  for (Integer partition : offsetStart.keySet()) {
    job.getConfiguration().set(CONFIG_KAFKA_PARITION_START + partition, offsetStart.get(partition).toString());
    job.getConfiguration().set(CONFIG_KAFKA_PARITION_END + partition, offsetEnd.get(partition).toString());
  }

  job.setMapperClass(KafkaFlatTableMapper.class);
  job.setInputFormatClass(KafkaInputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setNumReduceTasks(0);
}
@Override
protected void configureJob(Job job) throws IOException {
  Configuration conf = job.getConfiguration();

  job.setJarByClass(PartialBuilder.class);

  FileInputFormat.setInputPaths(job, getDataPath());
  FileOutputFormat.setOutputPath(job, getOutputPath(conf));

  job.setMapOutputKeyClass(LongWritable.class);
  job.setMapOutputValueClass(MapredOutput.class);
  job.setOutputKeyClass(LongWritable.class);
  job.setOutputValueClass(RuleBase.class);

  job.setMapperClass(ChiMapper.class);
  job.setReducerClass(ChiReducer.class);
  job.setNumReduceTasks(1);

  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
}
/**
 * Cluster the data using Hadoop
 */
private static void clusterDataMR(Path input, Path clustersIn, Path output)
    throws IOException, InterruptedException, ClassNotFoundException {
  Configuration conf = new Configuration();
  conf.set(STATE_IN_KEY, clustersIn.toString());

  Job job = new Job(conf, "Mean Shift Driver running clusterData over input: " + input);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(WeightedVectorWritable.class);
  job.setMapperClass(MeanShiftCanopyClusterMapper.class);
  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setNumReduceTasks(0);
  job.setJarByClass(MeanShiftCanopyDriver.class);

  FileInputFormat.setInputPaths(job, input);
  FileOutputFormat.setOutputPath(job, output);

  if (!job.waitForCompletion(true)) {
    throw new InterruptedException("Mean Shift Clustering failed on clustersIn " + clustersIn);
  }
}
public static void runJob(Path input, Path output)
    throws IOException, InterruptedException, ClassNotFoundException {
  Configuration conf = new Configuration();

  Job job = new Job(conf, "UnitVectorizerJob");
  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(VectorWritable.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setMapperClass(UnitVectorizerMapper.class);
  job.setNumReduceTasks(0);

  FileInputFormat.addInputPath(job, input);
  FileOutputFormat.setOutputPath(job, output);
  job.setJarByClass(UnitVectorizerJob.class);

  boolean succeeded = job.waitForCompletion(true);
  if (!succeeded) {
    throw new IllegalStateException("Job failed!");
  }
}
/**
 * Convert vectors to MeanShiftCanopies using Hadoop
 */
private static void createCanopyFromVectorsMR(Configuration conf, Path input, Path output,
    DistanceMeasure measure) throws IOException, InterruptedException, ClassNotFoundException {
  conf.set(KMeansConfigKeys.DISTANCE_MEASURE_KEY, measure.getClass().getName());

  Job job = new Job(conf);
  job.setJarByClass(MeanShiftCanopyDriver.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(ClusterWritable.class);
  job.setMapperClass(MeanShiftCanopyCreatorMapper.class);
  job.setNumReduceTasks(0);
  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);

  FileInputFormat.setInputPaths(job, input);
  FileOutputFormat.setOutputPath(job, output);

  if (!job.waitForCompletion(true)) {
    throw new InterruptedException("Mean Shift createCanopyFromVectorsMR failed on input " + input);
  }
}
private static double calculatePerplexity(Configuration conf, Path corpusPath, Path modelPath, int iteration)
    throws IOException, ClassNotFoundException, InterruptedException {
  String jobName = "Calculating perplexity for " + modelPath;
  log.info("About to run: " + jobName);

  Job job = new Job(conf, jobName);
  job.setJarByClass(CachingCVB0PerplexityMapper.class);
  job.setMapperClass(CachingCVB0PerplexityMapper.class);
  job.setCombinerClass(DualDoubleSumReducer.class);
  job.setReducerClass(DualDoubleSumReducer.class);
  job.setNumReduceTasks(1);
  job.setOutputKeyClass(DoubleWritable.class);
  job.setOutputValueClass(DoubleWritable.class);
  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);

  FileInputFormat.addInputPath(job, corpusPath);
  Path outputPath = perplexityPath(modelPath.getParent(), iteration);
  FileOutputFormat.setOutputPath(job, outputPath);
  setModelPaths(job, modelPath);
  HadoopUtil.delete(conf, outputPath);

  if (!job.waitForCompletion(true)) {
    throw new InterruptedException("Failed to calculate perplexity for: " + modelPath);
  }
  return readPerplexity(conf, modelPath.getParent(), iteration);
}
private static Job writeTopicModel(Configuration conf, Path modelInput, Path output)
    throws IOException, InterruptedException, ClassNotFoundException {
  String jobName = String.format("Writing final topic/term distributions from %s to %s", modelInput, output);
  log.info("About to run: " + jobName);

  Job job = new Job(conf, jobName);
  job.setJarByClass(CVB0Driver.class);
  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setMapperClass(CVB0TopicTermVectorNormalizerMapper.class);
  job.setNumReduceTasks(0);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(VectorWritable.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);

  FileInputFormat.addInputPath(job, modelInput);
  FileOutputFormat.setOutputPath(job, output);

  job.submit();
  return job;
}
private static Job writeDocTopicInference(Configuration conf, Path corpus, Path modelInput, Path output)
    throws IOException, ClassNotFoundException, InterruptedException {
  String jobName = String.format("Writing final document/topic inference from %s to %s", corpus, output);
  log.info("About to run: " + jobName);

  Job job = new Job(conf, jobName);
  job.setMapperClass(CVB0DocInferenceMapper.class);
  job.setNumReduceTasks(0);
  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(VectorWritable.class);

  FileSystem fs = FileSystem.get(corpus.toUri(), conf);
  if (modelInput != null && fs.exists(modelInput)) {
    FileStatus[] statuses = fs.listStatus(modelInput, PathFilters.partFilter());
    URI[] modelUris = new URI[statuses.length];
    for (int i = 0; i < statuses.length; i++) {
      modelUris[i] = statuses[i].getPath().toUri();
    }
    DistributedCache.setCacheFiles(modelUris, conf);
  }

  FileInputFormat.addInputPath(job, corpus);
  FileOutputFormat.setOutputPath(job, output);
  job.setJarByClass(CVB0Driver.class);
  job.submit();
  return job;
}
@Override
public void createVectors(Path input, Path output, VectorizerConfig config)
    throws IOException, ClassNotFoundException, InterruptedException {
  //do this for convenience of using prepareJob
  Job job = HadoopUtil.prepareJob(input, output, SequenceFileInputFormat.class,
      EncodingMapper.class, Text.class, VectorWritable.class, SequenceFileOutputFormat.class,
      config.getConf());
  Configuration conf = job.getConfiguration();
  conf.set(EncodingMapper.USE_SEQUENTIAL, String.valueOf(config.isSequentialAccess()));
  conf.set(EncodingMapper.USE_NAMED_VECTORS, String.valueOf(config.isNamedVectors()));
  conf.set(EncodingMapper.ANALYZER_NAME, config.getAnalyzerClassName());
  conf.set(EncodingMapper.ENCODER_FIELD_NAME, config.getEncoderName());
  conf.set(EncodingMapper.ENCODER_CLASS, config.getEncoderClass());
  conf.set(EncodingMapper.CARDINALITY, String.valueOf(config.getCardinality()));
  job.setNumReduceTasks(0);

  boolean finished = job.waitForCompletion(true);
  log.info("result of run: {}", finished);
  if (!finished) {
    throw new IllegalStateException("Job failed!");
  }
}
/**
 * Process as a map reduce job. The numberOfReduceTasks is set to the number of clusters present in the
 * output, so that each cluster's vector is written in its own part file.
 *
 * @param conf
 *          The hadoop configuration.
 * @param input
 *          The output path provided to the clustering algorithm, which is to be post processed. Hint: the
 *          path of the directory containing clusters-*-final and clusteredPoints.
 * @param output
 *          The path where the post processed data will be stored.
 */
private static void postProcessMR(Configuration conf, Path input, Path output)
    throws IOException, InterruptedException, ClassNotFoundException {
  Job job = new Job(conf, "ClusterOutputPostProcessor Driver running over input: " + input);
  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setMapperClass(ClusterOutputPostProcessorMapper.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(VectorWritable.class);
  job.setReducerClass(ClusterOutputPostProcessorReducer.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(VectorWritable.class);

  int numberOfClusters = ClusterCountReader.getNumberOfClusters(input, conf);
  job.setNumReduceTasks(numberOfClusters);
  job.setJarByClass(ClusterOutputPostProcessorDriver.class);

  FileInputFormat.addInputPath(job, new Path(input, new Path("clusteredPoints")));
  FileOutputFormat.setOutputPath(job, output);

  if (!job.waitForCompletion(true)) {
    throw new InterruptedException("ClusterOutputPostProcessor Job failed processing " + input);
  }
}
/**
 * Sets task classes with related info if needed into configuration object.
 *
 * @param job Job to configure.
 * @param setMapper Option to set mapper and input format classes.
 * @param setCombiner Option to set combiner class.
 * @param setReducer Option to set reducer and output format classes.
 * @param outputCompression Option to enable block-compressed SequenceFile output.
 */
public static void setTasksClasses(Job job, boolean setMapper, boolean setCombiner,
    boolean setReducer, boolean outputCompression) {
  if (setMapper) {
    job.setMapperClass(HadoopWordCount2Mapper.class);
    job.setInputFormatClass(TextInputFormat.class);
  }

  if (setCombiner)
    job.setCombinerClass(HadoopWordCount2Combiner.class);

  if (setReducer) {
    job.setReducerClass(HadoopWordCount2Reducer.class);
    job.setOutputFormatClass(TextOutputFormat.class);
  }

  if (outputCompression) {
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
    SequenceFileOutputFormat.setCompressOutput(job, true);

    job.getConfiguration().set(FileOutputFormat.COMPRESS_CODEC, SnappyCodec.class.getName());
  }
}
public static void runIteration(Configuration conf, Path corpusInput, Path modelInput, Path modelOutput,
    int iterationNumber, int maxIterations, int numReduceTasks)
    throws IOException, ClassNotFoundException, InterruptedException {
  String jobName = String.format("Iteration %d of %d, input path: %s", iterationNumber, maxIterations, modelInput);
  log.info("About to run: " + jobName);

  Job job = new Job(conf, jobName);
  job.setJarByClass(CVB0Driver.class);
  job.setMapperClass(CachingCVB0Mapper.class);
  job.setCombinerClass(VectorSumReducer.class);
  job.setReducerClass(VectorSumReducer.class);
  job.setNumReduceTasks(numReduceTasks);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(VectorWritable.class);
  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);

  FileInputFormat.addInputPath(job, corpusInput);
  FileOutputFormat.setOutputPath(job, modelOutput);
  setModelPaths(job, modelInput);
  HadoopUtil.delete(conf, modelOutput);

  if (!job.waitForCompletion(true)) {
    throw new InterruptedException(String.format("Failed to complete iteration %d stage 1", iterationNumber));
  }
}
public static Job createJob(Path[] inputPaths, Path outputPath, Map<String, String> metadata,
    Configuration config) throws IOException {
  final Job job = new Job(config);
  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(BytesWritable.class);
  job.setOutputFormatClass(MetadataSequenceFileOutputFormat.class);

  SequenceFileInputFormat.setInputPaths(job, inputPaths);
  SequenceFileOutputFormat.setOutputPath(job, outputPath);
  SequenceFileOutputFormat.setCompressOutput(job, true);
  SequenceFileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
  SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

  if (metadata != null)
    MetadataConfiguration.setMetadata(metadata, job.getConfiguration());

  return job;
}
@Override
public int run(String[] args) throws Exception {
  if (args.length != 2) {
    System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getSimpleName());
    ToolRunner.printGenericCommandUsage(System.err);
    return -1;
  }

  Job job = Job.getInstance(getConf());
  job.setMapperClass(SortGroupResultMapper.class);
  job.setNumReduceTasks(0);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(Text.class);

  FileInputFormat.addInputPath(job, new Path(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));

  // Sort data by total number:
  job.setOutputFormatClass(SequenceFileOutputFormat.class);

  return job.waitForCompletion(true) ? 0 : 1;
}
@Override
public int run(String[] args) throws Exception {
  if (args.length != 2) {
    System.out.printf("Usage: CreateSequenceFile <input dir> <output dir>\n");
    return -1;
  }

  Job job = new Job(getConf());
  job.setJarByClass(CreateSequenceFile.class);
  job.setJobName("Create Sequence File");
  job.setNumReduceTasks(0);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);

  FileInputFormat.setInputPaths(job, new Path(args[0]));
  SequenceFileOutputFormat.setOutputPath(job, new Path(args[1]));

  FileOutputFormat.setCompressOutput(job, true);
  FileOutputFormat.setOutputCompressorClass(job, SnappyCodec.class);
  SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

  boolean success = job.waitForCompletion(true);
  return success ? 0 : 1;
}