private boolean genBigItemMap(String input, String output) throws IOException, ClassNotFoundException, InterruptedException { Job job = Job.getInstance(this.getConf(), "Computing items remapping for " + this.input); job.setJarByClass(TopPIoverHadoop.class); job.setInputFormatClass(SequenceFileInputFormat.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(IntWritable.class); FileInputFormat.addInputPath(job, new Path(input)); FileOutputFormat.setOutputPath(job, new Path(output)); job.setMapperClass(InverseMapper.class); job.setReducerClass(ItemBigRebasingReducer.class); job.setNumReduceTasks(1); return job.waitForCompletion(true); }
@Override public int run(String[] args) throws Exception { if (args.length != 2) { System.out.printf("Usage: WordCo <input dir> <output dir>\n"); return -1; } Job job = new Job(getConf()); job.setJarByClass(WordCoAscendingFrequency.class); job.setJobName("Word Co-Occurrence String Pair"); FileInputFormat.setInputPaths(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); job.setInputFormatClass(SequenceFileInputFormat.class); //job.setMapOutputKeyClass(IntWritable.class); //job.setMapOutputValueClass(StringPairWritable.class); job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(StringPairWritable.class); job.setMapperClass(InverseMapper.class); //job.setReducerClass(SumReducer.class);//identity reducer boolean success = job.waitForCompletion(true); return success ? 0 : 1; }
@Override public int run(String[] args) throws Exception { if (args.length != 2) { System.out.printf("Usage: WordCo <input dir> <output dir>\n"); return -1; } Job job = new Job(getConf()); job.setJarByClass(WordCoDescendingFrequency.class); job.setJobName("Word Co-Occurrence String Pair"); FileInputFormat.setInputPaths(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); job.setInputFormatClass(SequenceFileInputFormat.class); //job.setMapOutputKeyClass(IntWritable.class); //job.setMapOutputValueClass(StringPairWritable.class); job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(StringPairWritable.class); job.setMapperClass(InverseMapper.class); //jobsetSortComparatorClass(LongWritable.DecreasingComparator.class); job.setSortComparatorClass(IntDescendingComparator.class); //job.setReducerClass(SumReducer.class);//identity reducer boolean success = job.waitForCompletion(true); return success ? 0 : 1; }
private void testComparator(String keySpec, int expect) throws Exception { String root = System.getProperty("test.build.data", "/tmp"); Path inDir = new Path(root, "test_cmp/in"); Path outDir = new Path(root, "test_cmp/out"); conf.set("mapreduce.partition.keycomparator.options", keySpec); conf.set("mapreduce.partition.keypartitioner.options", "-k1.1,1.1"); conf.set(MRJobConfig.MAP_OUTPUT_KEY_FIELD_SEPERATOR, " "); Job job = MapReduceTestUtil.createJob(conf, inDir, outDir, 1, 1, line1 +"\n" + line2 + "\n"); job.setMapperClass(InverseMapper.class); job.setReducerClass(Reducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(LongWritable.class); job.setSortComparatorClass(KeyFieldBasedComparator.class); job.setPartitionerClass(KeyFieldBasedPartitioner.class); job.waitForCompletion(true); assertTrue(job.isSuccessful()); // validate output Path[] outputFiles = FileUtil.stat2Paths(getFileSystem().listStatus(outDir, new Utils.OutputFileUtils.OutputFilesFilter())); if (outputFiles.length > 0) { InputStream is = getFileSystem().open(outputFiles[0]); BufferedReader reader = new BufferedReader(new InputStreamReader(is)); String line = reader.readLine(); //make sure we get what we expect as the first line, and also //that we have two lines (both the lines must end up in the same //reducer since the partitioner takes the same key spec for all //lines if (expect == 1) { assertTrue(line.startsWith(line1)); } else if (expect == 2) { assertTrue(line.startsWith(line2)); } line = reader.readLine(); if (expect == 1) { assertTrue(line.startsWith(line2)); } else if (expect == 2) { assertTrue(line.startsWith(line1)); } reader.close(); } }
public int run(String[] args) throws Exception { if (args.length < 3) { System.out.println("Grep <inDir> <outDir> <regex> [<group>]"); ToolRunner.printGenericCommandUsage(System.out); return 2; } Path tempDir = new Path("grep-temp-"+ Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); Configuration conf = getConf(); conf.set(RegexMapper.PATTERN, args[2]); if (args.length == 4) conf.set(RegexMapper.GROUP, args[3]); Job grepJob = Job.getInstance(conf); try { grepJob.setJobName("grep-search"); grepJob.setJarByClass(Grep.class); FileInputFormat.setInputPaths(grepJob, args[0]); grepJob.setMapperClass(RegexMapper.class); grepJob.setCombinerClass(LongSumReducer.class); grepJob.setReducerClass(LongSumReducer.class); FileOutputFormat.setOutputPath(grepJob, tempDir); grepJob.setOutputFormatClass(SequenceFileOutputFormat.class); grepJob.setOutputKeyClass(Text.class); grepJob.setOutputValueClass(LongWritable.class); grepJob.waitForCompletion(true); Job sortJob = Job.getInstance(conf); sortJob.setJobName("grep-sort"); sortJob.setJarByClass(Grep.class); FileInputFormat.setInputPaths(sortJob, tempDir); sortJob.setInputFormatClass(SequenceFileInputFormat.class); sortJob.setMapperClass(InverseMapper.class); sortJob.setNumReduceTasks(1); // write a single file FileOutputFormat.setOutputPath(sortJob, new Path(args[1])); sortJob.setSortComparatorClass( // sort by decreasing freq LongWritable.DecreasingComparator.class); sortJob.waitForCompletion(true); } finally { FileSystem.get(conf).delete(tempDir, true); } return 0; }
public int run(String[] args) throws Exception { if (args.length < 3) { System.out.println("Grep <inDir> <outDir> <regex> [<group>]"); ToolRunner.printGenericCommandUsage(System.out); return 2; } Path tempDir = new Path("grep-temp-"+ Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); Configuration conf = getConf(); conf.set(RegexMapper.PATTERN, args[2]); if (args.length == 4) conf.set(RegexMapper.GROUP, args[3]); Job grepJob = new Job(conf); try { grepJob.setJobName("grep-search"); FileInputFormat.setInputPaths(grepJob, args[0]); grepJob.setMapperClass(RegexMapper.class); grepJob.setCombinerClass(LongSumReducer.class); grepJob.setReducerClass(LongSumReducer.class); FileOutputFormat.setOutputPath(grepJob, tempDir); grepJob.setOutputFormatClass(SequenceFileOutputFormat.class); grepJob.setOutputKeyClass(Text.class); grepJob.setOutputValueClass(LongWritable.class); grepJob.waitForCompletion(true); Job sortJob = new Job(conf); sortJob.setJobName("grep-sort"); FileInputFormat.setInputPaths(sortJob, tempDir); sortJob.setInputFormatClass(SequenceFileInputFormat.class); sortJob.setMapperClass(InverseMapper.class); sortJob.setNumReduceTasks(1); // write a single file FileOutputFormat.setOutputPath(sortJob, new Path(args[1])); sortJob.setSortComparatorClass( // sort by decreasing freq LongWritable.DecreasingComparator.class); sortJob.waitForCompletion(true); } finally { FileSystem.get(conf).delete(tempDir, true); } return 0; }
private void testComparator(String keySpec, int expect) throws Exception { String root = System.getProperty("test.build.data", "/tmp"); Path inDir = new Path(root, "test_cmp/in"); Path outDir = new Path(root, "test_cmp/out"); conf.set("mapreduce.partition.keycomparator.options", keySpec); conf.set("mapreduce.partition.keypartitioner.options", "-k1.1,1.1"); conf.set(MRJobConfig.MAP_OUTPUT_KEY_FIELD_SEPERATOR, " "); Job job = MapReduceTestUtil.createJob(conf, inDir, outDir, 1, 2, line1 +"\n" + line2 + "\n"); job.setMapperClass(InverseMapper.class); job.setReducerClass(Reducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(LongWritable.class); job.setSortComparatorClass(KeyFieldBasedComparator.class); job.setPartitionerClass(KeyFieldBasedPartitioner.class); job.waitForCompletion(true); assertTrue(job.isSuccessful()); // validate output Path[] outputFiles = FileUtil.stat2Paths(getFileSystem().listStatus(outDir, new Utils.OutputFileUtils.OutputFilesFilter())); if (outputFiles.length > 0) { InputStream is = getFileSystem().open(outputFiles[0]); BufferedReader reader = new BufferedReader(new InputStreamReader(is)); String line = reader.readLine(); //make sure we get what we expect as the first line, and also //that we have two lines (both the lines must end up in the same //reducer since the partitioner takes the same key spec for all //lines if (expect == 1) { assertTrue(line.startsWith(line1)); } else if (expect == 2) { assertTrue(line.startsWith(line2)); } line = reader.readLine(); if (expect == 1) { assertTrue(line.startsWith(line2)); } else if (expect == 2) { assertTrue(line.startsWith(line1)); } reader.close(); } }
@SuppressWarnings("deprecation") public int run(String[] args) throws Exception { long random = new Random().nextLong(); log.info("random -> " + random); // 第三个参数为抓取的单词目标 args = new String[] { String.format(ConfigUtils.HDFS.WORDCOUNT_IN, "word.txt"), String.format(ConfigUtils.HDFS.WORDCOUNT_OUT, random),"d" }; Path tempDir = new Path("grep-temp-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); Configuration conf = getConf(); conf.set(RegexMapper.PATTERN, args[2]); if (args.length == 4) conf.set(RegexMapper.GROUP, args[3]); Job grepJob = new Job(conf); try { grepJob.setJobName("grep-search"); FileInputFormat.setInputPaths(grepJob, args[0]); grepJob.setMapperClass(RegexMapper.class); grepJob.setCombinerClass(LongSumReducer.class); grepJob.setReducerClass(LongSumReducer.class); FileOutputFormat.setOutputPath(grepJob, tempDir); grepJob.setOutputFormatClass(SequenceFileOutputFormat.class); grepJob.setOutputKeyClass(Text.class); grepJob.setOutputValueClass(LongWritable.class); grepJob.waitForCompletion(true); Job sortJob = new Job(conf); sortJob.setJobName("grep-sort"); FileInputFormat.setInputPaths(sortJob, tempDir); sortJob.setInputFormatClass(SequenceFileInputFormat.class); sortJob.setMapperClass(InverseMapper.class); sortJob.setNumReduceTasks(1); // write a single file FileOutputFormat.setOutputPath(sortJob, new Path(args[1])); sortJob.setSortComparatorClass( // sort by decreasing freq LongWritable.DecreasingComparator.class); sortJob.waitForCompletion(true); } finally { FileSystem.get(conf).delete(tempDir, true); } return 0; }
@SuppressWarnings("rawtypes") @Override protected Class<? extends Mapper> getMapperClass() { return InverseMapper.class; }
public int run(String[] args) throws Exception { if (args.length < 3) { System.out.println("Grep <inDir> <outDir> <regex> [<group>]"); ToolRunner.printGenericCommandUsage(System.out); return 2; } Path tempDir = new Path("grep-temp-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); Configuration conf = getConf(); conf.set(RegexMapper.PATTERN, args[2]); if (args.length == 4) conf.set(RegexMapper.GROUP, args[3]); Job grepJob = new Job(conf); try { grepJob.setJobName("grep-search"); FileInputFormat.setInputPaths(grepJob, args[0]); grepJob.setMapperClass(RegexMapper.class); grepJob.setCombinerClass(LongSumReducer.class); grepJob.setReducerClass(LongSumReducer.class); FileOutputFormat.setOutputPath(grepJob, tempDir); grepJob.setOutputFormatClass(SequenceFileOutputFormat.class); grepJob.setOutputKeyClass(Text.class); grepJob.setOutputValueClass(LongWritable.class); grepJob.waitForCompletion(true); Job sortJob = new Job(conf); sortJob.setJobName("grep-sort"); FileInputFormat.setInputPaths(sortJob, tempDir); sortJob.setInputFormatClass(SequenceFileInputFormat.class); sortJob.setMapperClass(InverseMapper.class); sortJob.setNumReduceTasks(1); // write a single file FileOutputFormat.setOutputPath(sortJob, new Path(args[1])); sortJob.setSortComparatorClass( // sort by decreasing freq LongWritable.DecreasingComparator.class); sortJob.waitForCompletion(true); } finally { FileSystem.get(conf).delete(tempDir, true); } return 0; }
public int run(String[] args) throws Exception { if (args.length < 3) { System.out.println("Grep <inDir> <outDir> <regex> [<group>]"); ToolRunner.printGenericCommandUsage(System.out); org.apache.hadoop.util.Tool t; return 2; } Path tempDir = new Path("grep-temp-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); Configuration conf = getConf(); conf.set(RegexMapper.PATTERN, args[2]); if (args.length == 4) conf.set(RegexMapper.GROUP, args[3]); Job grepJob = Job.getInstance(conf); try { grepJob.setJobName("grep-search"); FileInputFormat.setInputPaths(grepJob, args[0]); grepJob.setMapperClass(RegexMapper.class); grepJob.setCombinerClass(LongSumReducer.class); grepJob.setReducerClass(LongSumReducer.class); FileOutputFormat.setOutputPath(grepJob, tempDir); grepJob.setOutputFormatClass(SequenceFileOutputFormat.class); grepJob.setOutputKeyClass(Text.class); grepJob.setOutputValueClass(LongWritable.class); grepJob.waitForCompletion(true); Job sortJob = Job.getInstance(conf); sortJob.setJobName("grep-sort"); FileInputFormat.setInputPaths(sortJob, tempDir); sortJob.setInputFormatClass(SequenceFileInputFormat.class); sortJob.setMapperClass(InverseMapper.class); sortJob.setNumReduceTasks(1); // write a single file FileOutputFormat.setOutputPath(sortJob, new Path(args[1])); sortJob.setSortComparatorClass( // sort by decreasing freq LongWritable.DecreasingComparator.class); sortJob.waitForCompletion(true); } finally { FileSystem.get(conf).delete(tempDir, true); } return 0; }