private List<Writable> getSeqRecords(Path dir, Text key) throws Exception {
  SequenceFile.Reader[] readers =
      SequenceFileOutputFormat.getReaders(getConf(), dir);
  ArrayList<Writable> res = new ArrayList<Writable>();
  Class<?> keyClass = readers[0].getKeyClass();
  Class<?> valueClass = readers[0].getValueClass();
  if (!keyClass.getName().equals("org.apache.hadoop.io.Text"))
    throw new IOException("Incompatible key (" + keyClass.getName() + ")");
  Writable aKey = (Writable) keyClass.newInstance();
  Writable value = (Writable) valueClass.newInstance();
  for (int i = 0; i < readers.length; i++) {
    while (readers[i].next(aKey, value)) {
      if (aKey.equals(key)) {
        res.add(value);
        value = (Writable) valueClass.newInstance();
      }
    }
    readers[i].close();
  }
  return res;
}
@Override
public void run(String[] args) throws Exception {
  Flags flags = new Flags();
  flags.addWithDefaultValue(
      "tag_subject_data", "/media/work/datasets(secret)/douban/raw/tag_subject.dat", "");
  flags.addWithDefaultValue(
      "subject_data", "/media/work/datasets(secret)/douban/raw/subject.dat", "");
  flags.add("output");
  flags.parseAndCheck(args);

  JobConf job = new JobConf(this.getClass());
  job.setJobName("convert-douban-raw-to-posts");
  MapReduceHelper.setAllOutputTypes(job, Text.class);
  MapReduceHelper.setMR(job, DoubanRawMapper.class, DoubanToPostReducer.class);
  job.setInputFormat(TextInputFormat.class);
  TextInputFormat.addInputPath(job, new Path(flags.getString("tag_subject_data")));
  TextInputFormat.addInputPath(job, new Path(flags.getString("subject_data")));
  job.setOutputFormat(SequenceFileOutputFormat.class);
  SequenceFileOutputFormat.setOutputPath(job, new Path(flags.getString("output")));
  JobClient.runJob(job);
}
private JobConf createJobConf() {
  JobConf jobConf = new JobConf(getConf());
  String jobName = NAME + " " + dateForm.format(new Date(System.currentTimeMillis()));
  jobConf.setJobName(jobName);
  jobConf.setMapSpeculativeExecution(false);
  jobConf.setJarByClass(DataFsck.class);
  jobConf.setInputFormat(DataFsckInputFormat.class);
  jobConf.setOutputFormat(SequenceFileOutputFormat.class);
  jobConf.setOutputKeyClass(Text.class);
  jobConf.setOutputValueClass(Text.class);
  jobConf.setMapperClass(DataFsckMapper.class);
  jobConf.setNumReduceTasks(0);
  return jobConf;
}
private JobConf createJobConf(Configuration conf) {
  JobConf jobConf = new JobConf(conf);
  String jobName = NAME + "_" + dateForm.format(new Date(System.currentTimeMillis()));
  jobConf.setJobName(jobName);
  jobConf.setMapSpeculativeExecution(false);
  jobConf.setJarByClass(FastFileCheck.class);
  jobConf.setInputFormat(FileCheckInputFormat.class);
  jobConf.setOutputFormat(SequenceFileOutputFormat.class);
  jobConf.setOutputKeyClass(Text.class);
  jobConf.setOutputValueClass(Text.class);
  jobConf.setMapperClass(FileCheckMapper.class);
  jobConf.setNumReduceTasks(0);
  jobConf.setBoolean(SOURCE_ONLY_CONF, sourceOnly);
  return jobConf;
}
public static void IDMappingJob(String[] args) throws IOException {
  JobConf job = new JobConf();
  new GenericOptionsParser(job, args);
  job.setJarByClass(HybridDriver.class);
  job.setJobName("Converting binary similarity scores to text");
  job.setMapperClass(IDMapper.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(Text.class);
  job.setNumReduceTasks(0);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);

  Path inputPath = new Path(OUTPUT_DIR);
  job.setInputFormat(SequenceFileInputFormat.class);
  SequenceFileInputFormat.setInputPaths(job, inputPath);
  Path outputPath = new Path("SimilarityScores");
  job.setOutputFormat(TextOutputFormat.class);
  SequenceFileOutputFormat.setOutputPath(job, outputPath);
  FileSystem.get(job).delete(outputPath, true);
  HashPagesDriver.prepareDistribCache(job, HashPagesDriver.IDS_FILE2); // remove not sure
  JobSubmitter.run(job, "BINARY TO TEXT",
      job.getFloat(Config.THRESHOLD_PROPERTY, Config.THRESHOLD_VALUE));
}
/**
 * Runs a map-only MR job to convert an input directory of numeric valued
 * records to hadoop sequence format. It assumes the input is text in the
 * format [id feature weight ...].
 */
public static void writeSequence() throws IOException {
  JobConf job = new JobConf();
  job.setJobName("Convert text vectors to hadoop sequence");
  job.setJarByClass(SeqWriter.class);
  job.setMapperClass(SeqMapper.class);
  job.setNumReduceTasks(0);
  job.setMapOutputKeyClass(LongWritable.class);
  job.setMapOutputValueClass(FeatureWeightArrayWritable.class);
  job.setOutputKeyClass(LongWritable.class);
  job.setOutputValueClass(FeatureWeightArrayWritable.class);

  job.setInputFormat(TextInputFormat.class);
  TextInputFormat.addInputPath(job, new Path(INPUT_DIR));
  FileSystem.get(job).delete(new Path(HashPagesDriver.IDS_FILE2), true);
  Path outputPath = new Path(OUTPUT_DIR);
  FileSystem.get(job).delete(outputPath, true);
  job.setOutputFormat(SequenceFileOutputFormat.class);
  SequenceFileOutputFormat.setOutputPath(job, outputPath);

  JobSubmitter.run(job, "PREPROCESS", -1);
}
public static void main(String[] args) throws Exception {
  if (args.length != 2) {
    throw new Exception("Usage BasicSaveSequenceFile [sparkMaster] [output]");
  }
  String master = args[0];
  String fileName = args[1];

  JavaSparkContext sc = new JavaSparkContext(
      master, "basicloadsequencefile", System.getenv("SPARK_HOME"), System.getenv("JARS"));
  List<Tuple2<String, Integer>> input = new ArrayList();
  input.add(new Tuple2("coffee", 1));
  input.add(new Tuple2("coffee", 2));
  input.add(new Tuple2("pandas", 3));
  JavaPairRDD<String, Integer> rdd = sc.parallelizePairs(input);
  JavaPairRDD<Text, IntWritable> result = rdd.mapToPair(new ConvertToWritableTypes());
  result.saveAsHadoopFile(fileName, Text.class, IntWritable.class, SequenceFileOutputFormat.class);
}
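The ConvertToWritableTypes pair function used above is referenced but not shown. Since the RDD goes from JavaPairRDD<String, Integer> to JavaPairRDD<Text, IntWritable>, a minimal sketch of such a helper might look like the following; only the class name comes from the snippet, the body is an assumption.

// Assumed sketch of the ConvertToWritableTypes helper referenced above:
// it wraps the plain String/Integer pair in Hadoop Writable types so that
// SequenceFileOutputFormat can serialize the records.
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;

public class ConvertToWritableTypes
    implements PairFunction<Tuple2<String, Integer>, Text, IntWritable> {
  @Override
  public Tuple2<Text, IntWritable> call(Tuple2<String, Integer> record) {
    return new Tuple2<Text, IntWritable>(new Text(record._1()), new IntWritable(record._2()));
  }
}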
public long produceSamples(Path samplePath) throws Exception {
  Path input = new Path(samplePath.toString() + "-seeds");
  this.numSamples = writeSeeds(input);
  LOG.info("Generating " + this.numSamples + " of samples");

  JobConf jobConf = getJobConf();
  jobConf.set("genkmeansdataset.dimensions", Integer.toString(dimension));
  FileInputFormat.setInputPaths(jobConf, input);
  FileOutputFormat.setOutputPath(jobConf, samplePath);
  jobConf.setMapperClass(MapClass.class);
  jobConf.setInputFormat(SequenceFileInputFormat.class);
  jobConf.setOutputFormat(SequenceFileOutputFormat.class);
  jobConf.setOutputKeyClass(LongWritable.class);
  jobConf.setOutputValueClass(VectorWritable.class);
  jobConf.setNumReduceTasks(0);
  JobClient.runJob(jobConf);
  return this.numSamples;
}
/**
 * Create a job configuration
 */
@SuppressWarnings("rawtypes")
public static JobConf createJobConf(String name, String topic, Props props, Class classobj)
    throws Exception {
  JobConf conf = getJobConf(name, props, classobj);
  conf.set("topic", topic);

  // input format
  conf.setInputFormat(KafkaETLInputFormat.class);
  // turn off mapper speculative execution
  conf.setMapSpeculativeExecution(false);

  // setup multiple outputs
  MultipleOutputs.addMultiNamedOutput(conf, "offsets", SequenceFileOutputFormat.class,
      KafkaETLKey.class, BytesWritable.class);

  return conf;
}
int runCreateJob(String inputPathString, String outputPathString, String jobName)
    throws IOException {
  JobConf jobConf = new JobConf(this.conf);
  jobConf.setJobName(jobName);
  jobConf.setMapSpeculativeExecution(false);

  FileInputFormat.addInputPath(jobConf, new Path(inputPathString));
  FileOutputFormat.setOutputPath(jobConf, new Path(outputPathString));

  jobConf.setInputFormat(SequenceFileInputFormat.class);
  jobConf.setOutputKeyClass(LongWritable.class);
  jobConf.setOutputValueClass(CreateFileInfo.class);
  jobConf.setMapperClass(CreateFileMapper.class);
  jobConf.setReducerClass(IdentityReducer.class);
  jobConf.setOutputFormat(SequenceFileOutputFormat.class);

  RunningJob result = JobClient.runJob(jobConf);
  return result.isSuccessful() ? 0 : -1;
}
@Override
protected RecordWriter<K, V> getBaseRecordWriter(FileSystem fs, JobConf job, String name,
    Progressable arg3) throws IOException {
  if (theSequenceFileOutputFormat == null) {
    theSequenceFileOutputFormat = new SequenceFileOutputFormat<K, V>();
  }
  return theSequenceFileOutputFormat.getRecordWriter(fs, job, name, arg3);
}
private void createBayesData() throws IOException, URISyntaxException {
  log.info("creating bayes text data ... ");

  JobConf job = new JobConf();

  Path fout = options.getResultPath();
  Utils.checkHdfsPath(fout);

  String jobname = "Create bayes data";
  job.setJobName(jobname);

  Utils.shareDict(options, job);
  setBayesOptions(job);

  FileInputFormat.setInputPaths(job, dummy.getPath());
  job.setInputFormat(NLineInputFormat.class);

  job.setJarByClass(CreateBayesPages.class);
  job.setMapperClass(CreateBayesPages.class);
  job.setNumReduceTasks(0);

  FileOutputFormat.setOutputPath(job, fout);
  job.setOutputFormat(SequenceFileOutputFormat.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(Text.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);

  log.info("Running Job: " + jobname);
  log.info("Pages file " + dummy.getPath() + " as input");
  log.info("Rankings file " + fout + " as output");
  JobClient.runJob(job);
  log.info("Finished Running Job: " + jobname);
}
/**
 * Runs the inverter job. The inverter job flips outlinks to inlinks to be
 * passed into the analysis job.
 *
 * @param nodeDb
 *          The node database to use.
 * @param outlinkDb
 *          The outlink database to use.
 * @param output
 *          The output directory.
 *
 * @throws IOException
 *           If an error occurs while running the inverter job.
 */
private void runInverter(Path nodeDb, Path outlinkDb, Path output) throws IOException {
  // configure the inverter
  JobConf inverter = new NutchJob(getConf());
  inverter.setJobName("LinkAnalysis Inverter");
  FileInputFormat.addInputPath(inverter, nodeDb);
  FileInputFormat.addInputPath(inverter, outlinkDb);
  FileOutputFormat.setOutputPath(inverter, output);
  inverter.setInputFormat(SequenceFileInputFormat.class);
  inverter.setMapperClass(Inverter.class);
  inverter.setReducerClass(Inverter.class);
  inverter.setMapOutputKeyClass(Text.class);
  inverter.setMapOutputValueClass(ObjectWritable.class);
  inverter.setOutputKeyClass(Text.class);
  inverter.setOutputValueClass(LinkDatum.class);
  inverter.setOutputFormat(SequenceFileOutputFormat.class);
  inverter.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

  // run the inverter job
  LOG.info("Starting inverter job");
  try {
    JobClient.runJob(inverter);
  } catch (IOException e) {
    LOG.error(StringUtils.stringifyException(e));
    throw e;
  }
  LOG.info("Finished inverter job.");
}
/**
 * Extracts redirects and the target for each.
 *
 * @param inputPath
 * @param outputPath
 * @throws IOException
 */
private void task0(String inputPath, String outputPath) throws IOException {
  LOG.info("Extracting redirects (phase 0)...");
  LOG.info(" - input: " + inputPath);
  LOG.info(" - output: " + outputPath);

  JobConf conf = new JobConf(getConf(), ExtractWikipediaAnchorText.class);
  conf.setJobName(String.format(
      "ExtractWikipediaAnchorText:phase0[input: %s, output: %s]", inputPath, outputPath));

  conf.setNumReduceTasks(1);

  FileInputFormat.addInputPath(conf, new Path(inputPath));
  FileOutputFormat.setOutputPath(conf, new Path(outputPath));

  conf.setInputFormat(SequenceFileInputFormat.class);
  conf.setOutputFormat(SequenceFileOutputFormat.class);

  conf.setMapOutputKeyClass(Text.class);
  conf.setMapOutputValueClass(Text.class);
  conf.setOutputKeyClass(Text.class);
  conf.setOutputValueClass(Text.class);

  conf.setMapperClass(MyMapper0.class);
  conf.setReducerClass(IdentityReducer.class);

  JobClient.runJob(conf);
}
/**
 * Maps from each Wikipedia article to (srcID, (targetID, anchor)) pairs.
 *
 * @param inputPath
 * @param outputPath
 * @throws IOException
 */
private void task1(String inputPath, String outputPath) throws IOException {
  LOG.info("Extracting anchor text (phase 1)...");
  LOG.info(" - input: " + inputPath);
  LOG.info(" - output: " + outputPath);

  JobConf conf = new JobConf(getConf(), ExtractWikipediaAnchorText.class);
  conf.setJobName(String.format(
      "ExtractWikipediaAnchorText:phase1[input: %s, output: %s]", inputPath, outputPath));

  // 10 reducers is reasonable.
  conf.setNumReduceTasks(10);

  FileInputFormat.addInputPath(conf, new Path(inputPath));
  FileOutputFormat.setOutputPath(conf, new Path(outputPath));

  conf.setInputFormat(SequenceFileInputFormat.class);
  conf.setOutputFormat(SequenceFileOutputFormat.class);

  conf.setMapOutputKeyClass(PairOfStringInt.class);
  conf.setMapOutputValueClass(PairOfStrings.class);
  conf.setOutputKeyClass(Text.class);
  conf.setOutputValueClass(PairOfStrings.class);

  conf.setMapperClass(MyMapper1.class);
  conf.setReducerClass(MyReducer1.class);
  conf.setPartitionerClass(MyPartitioner1.class);

  // Delete the output directory if it exists already.
  FileSystem.get(conf).delete(new Path(outputPath), true);

  JobClient.runJob(conf);
}
/**
 * Write a list of PDB ids to a hadoop sequence file in MMTF format.
 * @param pdbCodeList the input list of PDB ids
 * @param uri the path of the hadoop sequence file to write
 * @param producer the producer string passed to the PDB-to-MMTF conversion
 */
public static void writeToFile(List<String> pdbCodeList, String uri, String producer) {
  JavaSparkContext javaSparkContext = SparkUtils.getSparkContext();
  MmtfUtils.setUpBioJava();
  JavaPairRDD<Text, BytesWritable> distData =
      javaSparkContext.parallelize(pdbCodeList)
          .mapToPair(new PdbIdToMmtf(producer))
          .mapToPair(t -> new Tuple2<String, byte[]>(t._1, WriterUtils.gzipCompress(t._2)))
          .mapToPair(new StringByteToTextByteWriter());
  distData.saveAsHadoopFile(uri, Text.class, BytesWritable.class, SequenceFileOutputFormat.class);
  javaSparkContext.close();
}
public TextMapReduceJobConf() {
  super();
  this.setInputFormat(SequenceFileInputFormat.class);
  this.setOutputFormat(SequenceFileOutputFormat.class);
  this.setMapOutputKeyClass(Text.class);
  this.setMapOutputValueClass(Text.class);
  this.setOutputKeyClass(Text.class);
  this.setOutputValueClass(Text.class);
}
public TextMapReduceJobConf(Class jobClass) {
  super(jobClass);
  this.setInputFormat(SequenceFileInputFormat.class);
  this.setOutputFormat(SequenceFileOutputFormat.class);
  this.setMapOutputKeyClass(Text.class);
  this.setMapOutputValueClass(Text.class);
  this.setOutputKeyClass(Text.class);
  this.setOutputValueClass(Text.class);
}
public static void SetSeqFileInputOutput(JobConf job, String inputPaths, Path output)
    throws IOException {
  job.setInputFormat(SequenceFileInputFormat.class);
  job.setOutputFormat(SequenceFileOutputFormat.class);
  SequenceFileOutputFormat.setOutputPath(job, output);

  // Expand input pattern.
  FileSystem fs = FileSystem.get(job);
  String[] paths = inputPaths.split(",");
  for (String p : paths) {
    int lastslash = p.lastIndexOf("/");
    if (lastslash < 0) {
      p = "./" + p;
      lastslash = 1;
    }
    String parent = p.substring(0, lastslash);
    p = p.substring(lastslash + 1);

    // Each path is treated as a pattern.
    p = p.replace("\\", "\\\\");
    p = p.replace(".", "\\.");
    p = p.replace("*", ".*");
    p = p.replace("?", ".");
    LOG.info("Use pattern:" + p);
    Pattern re = Pattern.compile(p);

    // List all files.
    FileStatus[] files = fs.listStatus(new Path(parent));
    for (FileStatus f : files) {
      if (re.matcher(f.getPath().getName()).matches()) {
        SequenceFileInputFormat.addInputPath(job, f.getPath());
        LOG.info("Adding input:" + f.getPath());
      }
    }
  }
}
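For illustration, a hypothetical call to SetSeqFileInputOutput (the driver class and paths below are placeholders, not values from the original project) expands each comma-separated entry as a glob over its parent directory:

// Hypothetical usage sketch; "/data/part-*" and "/logs/run-?" are placeholder paths.
JobConf job = new JobConf(MyDriver.class);  // MyDriver is a placeholder driver class
SetSeqFileInputOutput(job, "/data/part-*,/logs/run-?", new Path("/output/seq"));
// Every file under /data whose name matches part-* and every file under /logs
// matching run-? is added as a SequenceFile input; /output/seq becomes the
// SequenceFileOutputFormat output path.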
List<SequenceFile.Reader> getOutputs(List<JobContext> submitted) throws IOException {
  List<SequenceFile.Reader> outputs = new ArrayList<SequenceFile.Reader>();
  for (JobContext ctx : submitted) {
    SequenceFile.Reader[] jobOutputs = SequenceFileOutputFormat.getReaders(
        getConf(), SequenceFileOutputFormat.getOutputPath(ctx.jobConf));
    for (SequenceFile.Reader r : jobOutputs) {
      outputs.add(r);
    }
  }
  return outputs;
}
@SuppressWarnings("unused") public static void main(String[] args) throws IOException { JobConf conf = new JobConf(EdgelistPartitioner.class); if (conf == null) { return; } String dir1 = "/user/miyuru/merged"; String dir2 = "/user/miyuru/merged-out"; // We first delete the temporary directories if they exist on the HDFS FileSystem fs1 = FileSystem.get(new JobConf()); // only delete dir2 because dir1 is uploaded externally. if (fs1.exists(new Path(dir2))) { fs1.delete(new Path(dir2), true); } conf.setInputFormat(WholeFileInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); WholeFileInputFormat.setInputPaths(conf, new Path(dir1)); SequenceFileOutputFormat.setOutputPath(conf, new Path(dir2)); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(Text.class); conf.setMapperClass(SequenceFileMapper.class); conf.setReducerClass(MultipleOutputsInvertedReducer.class); conf.setOutputFormat(NullOutputFormat.class); conf.setJobName("EdgelistPartitioner"); MultipleOutputs.addMultiNamedOutput(conf, "partition", TextOutputFormat.class, NullWritable.class, Text.class); JobClient.runJob(conf); }
@Parameterized.Parameters(name = "Executing: {0} {1}")
public static Collection<Object[]> parameters() {
  return Arrays.asList(
      new Object[]{TextOutputFormat.class, TextInputFormat.class},
      new Object[]{SequenceFileOutputFormat.class, SequenceFileInputFormat.class}
  );
}
/**
 * Runs the inverter job. The inverter job flips outlinks to inlinks to be
 * passed into the analysis job.
 *
 * The inverter job takes a link loops database if it exists. It is an
 * optional component of link analysis due to its extreme computational and
 * space requirements, but it can be very useful in weeding out and eliminating
 * link farms and other spam pages.
 *
 * @param nodeDb The node database to use.
 * @param outlinkDb The outlink database to use.
 * @param loopDb The loop database to use if it exists.
 * @param output The output directory.
 *
 * @throws IOException If an error occurs while running the inverter job.
 */
private void runInverter(Path nodeDb, Path outlinkDb, Path loopDb, Path output)
    throws IOException {
  // configure the inverter
  JobConf inverter = new NutchJob(getConf());
  inverter.setJobName("LinkAnalysis Inverter");
  FileInputFormat.addInputPath(inverter, nodeDb);
  FileInputFormat.addInputPath(inverter, outlinkDb);

  // add the loop database if it exists, isn't null
  if (loopDb != null) {
    FileInputFormat.addInputPath(inverter, loopDb);
  }
  FileOutputFormat.setOutputPath(inverter, output);
  inverter.setInputFormat(SequenceFileInputFormat.class);
  inverter.setMapperClass(Inverter.class);
  inverter.setReducerClass(Inverter.class);
  inverter.setMapOutputKeyClass(Text.class);
  inverter.setMapOutputValueClass(ObjectWritable.class);
  inverter.setOutputKeyClass(Text.class);
  inverter.setOutputValueClass(LinkDatum.class);
  inverter.setOutputFormat(SequenceFileOutputFormat.class);
  inverter.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

  // run the inverter job
  LOG.info("Starting inverter job");
  try {
    JobClient.runJob(inverter);
  } catch (IOException e) {
    LOG.error(StringUtils.stringifyException(e));
    throw e;
  }
  LOG.info("Finished inverter job.");
}
/**
 * Sets the job configurations including the mapper and reducer classes to
 * do the sorting based on vector lengths.
 */
public static void main(String[] args) throws IOException {
  JobConf job = new JobConf();
  new GenericOptionsParser(job, args);
  job.setJobName(LengthSortMain.class.getSimpleName());
  job.setJarByClass(LengthSortMain.class);
  job.setMapperClass(LengthSortMapper.class);
  job.setMapOutputKeyClass(FloatWritable.class);
  job.setMapOutputValueClass(IdFeatureWeightArrayWritable.class);

  job.setPartitionerClass(LengthRangePartitioner.class);

  job.setReducerClass(LengthSortReducer.class);
  job.setNumReduceTasks(job.getInt(SortDriver.NUM_REDUCE_PROPERTY, SortDriver.NUM_REDUCE_VALUE));
  job.setOutputKeyClass(LongWritable.class);
  job.setOutputValueClass(FeatureWeightArrayWritable.class);

  //
  // set input & output
  //
  String inputDir = SortDriver.INPUT_DIR;
  if (inputDir == null) {
    throw new UnsupportedOperationException("ERROR: input path not set");
  }
  job.setInputFormat(SequenceFileInputFormat.class);
  SequenceFileInputFormat.addInputPath(job, new Path(inputDir));
  Path outputPath = new Path(SortDriver.OUTPUT_DIR);
  FileSystem.get(job).delete(outputPath, true);
  job.setOutputFormat(SequenceFileOutputFormat.class);
  FileOutputFormat.setOutputPath(job, outputPath);

  //
  // run
  //
  JobSubmitter.run(job, "Sort By Vector Lengths", -1);
}
/**
 * Main method sets the job configurations including the mapper and reducer
 * classes to do the sorting. Some of the produced partitions might be
 * merged later to reflect the number of partitions chosen by the user.
 */
public static void main(String[] args) throws IOException {
  JobConf job = new JobConf();
  new GenericOptionsParser(job, args);
  job.setJobName("NormSort");
  job.setJarByClass(NormSortMain.class);
  job.setMapperClass(NormSortMapper.class);
  job.setMapOutputKeyClass(FloatWritable.class);
  job.setMapOutputValueClass(IdFeatureWeightArrayWritable.class);

  job.setPartitionerClass(NormRangePartitioner.class);

  job.setReducerClass(NormSortReducer.class);
  job.setNumReduceTasks(job.getInt(SortDriver.NUM_REDUCE_PROPERTY, SortDriver.NUM_REDUCE_VALUE));
  job.setOutputKeyClass(LongWritable.class);
  job.setOutputValueClass(FeatureWeightArrayWritable.class);

  //
  // set input & output
  //
  String inputDir = SortDriver.INPUT_DIR;
  if (inputDir == null) {
    throw new UnsupportedOperationException("ERROR: input path not set");
  }
  job.setInputFormat(SequenceFileInputFormat.class);
  SequenceFileInputFormat.addInputPath(job, new Path(inputDir));
  Path outputPath = new Path(SortDriver.OUTPUT_DIR);
  FileSystem.get(job).delete(outputPath, true);
  job.setOutputFormat(SequenceFileOutputFormat.class);
  FileOutputFormat.setOutputPath(job, outputPath);

  //
  // run
  //
  JobSubmitter.run(job, "Sort By p-norm", -1);
}
/**
 * Sets the job configurations including the mapper and reducer classes to
 * do the sorting based on signatures.
 */
public static void main(String[] args) throws IOException {
  JobConf job = new JobConf();
  new GenericOptionsParser(job, args);
  job.setJobName(SigSortMain.class.getSimpleName());
  job.setJarByClass(SigSortMain.class);
  job.setMapperClass(SigSortMapper.class);
  job.setMapOutputKeyClass(BitSignature.class);
  job.setMapOutputValueClass(LongWritable.class);

  job.setPartitionerClass(SigRangePartitioner.class);

  job.setReducerClass(SigSortReducer.class);
  job.setNumReduceTasks(job.getInt(SortDriver.NUM_REDUCE_PROPERTY, SortDriver.NUM_REDUCE_VALUE));
  job.setOutputKeyClass(LongWritable.class);
  job.setOutputValueClass(BitSignature.class);

  //
  // set input & output
  //
  String inputDir = SortDriver.INPUT_DIR;
  if (inputDir == null) {
    throw new UnsupportedOperationException("ERROR: input path not set");
  }
  job.setInputFormat(SequenceFileInputFormat.class);
  SequenceFileInputFormat.addInputPath(job, new Path(inputDir));
  Path outputPath = new Path(OUTPUT_PATH);
  FileSystem.get(job).delete(outputPath, true);
  job.setOutputFormat(SequenceFileOutputFormat.class);
  FileOutputFormat.setOutputPath(job, outputPath);

  //
  // run
  //
  JobSubmitter.run(job, "Sort By Signature Bytes", -1);
}
/**
 * Main method sets the job configurations including the mapper and reducer
 * classes to do the sorting.
 */
public static void main(String[] args) throws IOException {
  JobConf job = new JobConf();
  new GenericOptionsParser(job, args);
  // ToolRunner.printGenericCommandUsage(System.out);
  job.setJobName(MaxwSortMain.class.getSimpleName());
  job.setJarByClass(MaxwSortMain.class);
  job.setMapperClass(MaxwSortMapper.class);
  job.setMapOutputKeyClass(FloatWritable.class);
  job.setMapOutputValueClass(IdFeatureWeightArrayWritable.class);

  job.setPartitionerClass(MaxwRangePartitioner.class);

  job.setReducerClass(MaxwSortReducer.class);
  job.setNumReduceTasks(job.getInt(SortDriver.NUM_REDUCE_PROPERTY, SortDriver.NUM_REDUCE_VALUE));
  job.setOutputKeyClass(LongWritable.class);
  job.setOutputValueClass(FeatureWeightArrayWritable.class);

  //
  // set input & output
  //
  String inputDir = SortDriver.INPUT_DIR;
  if (inputDir == null) {
    throw new UnsupportedOperationException("ERROR: input path not set");
  }
  job.setInputFormat(SequenceFileInputFormat.class);
  SequenceFileInputFormat.addInputPath(job, new Path(inputDir));
  Path outputPath = new Path(SortDriver.OUTPUT_DIR);
  FileSystem.get(job).delete(outputPath, true);
  job.setOutputFormat(SequenceFileOutputFormat.class);
  FileOutputFormat.setOutputPath(job, outputPath);

  //
  // run
  //
  JobSubmitter.run(job, "Sort By infinity-Norm", -1);
}
public static void main(String[] args) throws Exception {
  JobConf job = new JobConf(SignaturesGenerator.class);
  new GenericOptionsParser(job, args);
  job.setJobName(SignaturesGenerator.class.getSimpleName());
  int nBits = job.getInt(ProjectionLshDriver.LSH_NBITS_PROPERTY,
      ProjectionLshDriver.LSH_NBITS_VALUE);
  setParameters();
  FileSystem fs = FileSystem.get(job);
  prepareDistributedCache(job, fs, new Path(ProjectionsGenerator.OUTPUT_DIR));
  Path outputPath = new Path(OUTPUT_DIR);
  if (fs.exists(outputPath))
    fs.delete(outputPath);

  FileInputFormat.setInputPaths(job, INPUT_DIR); // Path(INPUT_DIR));
  FileOutputFormat.setOutputPath(job, outputPath);
  // FileOutputFormat.setCompressOutput(job, false);

  job.setInputFormat(SequenceFileInputFormat.class);
  job.setOutputFormat(SequenceFileOutputFormat.class);

  job.set("mapred.child.java.opts", "-Xmx2048m");
  job.setInt("mapred.map.max.attempts", 10);
  job.setInt("mapred.reduce.max.attempts", 10);
  job.setInt("mapred.task.timeout", 6000000);

  job.setMapperClass(SigMapper.class);
  job.setMapOutputKeyClass(LongWritable.class);
  job.setMapOutputValueClass(BitSignature.class);
  job.setNumReduceTasks(0);
  job.setOutputKeyClass(LongWritable.class);
  job.setOutputValueClass(BitSignature.class);

  JobSubmitter.run(job, "LSH", -1);
}