public void testAddInputPathWithMapper() { final JobConf conf = new JobConf(); MultipleInputs.addInputPath(conf, new Path("/foo"), TextInputFormat.class, MapClass.class); MultipleInputs.addInputPath(conf, new Path("/bar"), KeyValueTextInputFormat.class, MapClass2.class); final Map<Path, InputFormat> inputs = MultipleInputs .getInputFormatMap(conf); final Map<Path, Class<? extends Mapper>> maps = MultipleInputs .getMapperTypeMap(conf); assertEquals(TextInputFormat.class, inputs.get(new Path("/foo")).getClass()); assertEquals(KeyValueTextInputFormat.class, inputs.get(new Path("/bar")) .getClass()); assertEquals(MapClass.class, maps.get(new Path("/foo"))); assertEquals(MapClass2.class, maps.get(new Path("/bar"))); }
public void configure(JobConf job) { super.configure(job); //disable the auto increment of the counter. For streaming, no of //processed records could be different(equal or less) than the no of //records input. SkipBadRecords.setAutoIncrMapperProcCount(job, false); skipping = job.getBoolean(MRJobConfig.SKIP_RECORDS, false); if (mapInputWriterClass_.getCanonicalName().equals(TextInputWriter.class.getCanonicalName())) { String inputFormatClassName = job.getClass("mapred.input.format.class", TextInputFormat.class).getCanonicalName(); ignoreKey = job.getBoolean("stream.map.input.ignoreKey", inputFormatClassName.equals(TextInputFormat.class.getCanonicalName())); } try { mapOutputFieldSeparator = job.get("stream.map.output.field.separator", "\t").getBytes("UTF-8"); mapInputFieldSeparator = job.get("stream.map.input.field.separator", "\t").getBytes("UTF-8"); numOfMapOutputKeyFields = job.getInt("stream.num.map.output.key.fields", 1); } catch (UnsupportedEncodingException e) { throw new RuntimeException("The current system does not support UTF-8 encoding!", e); } }
private static void fillMap(JetInstance client, String name, String inputPath, int parallelism) throws Exception { DAG dag = new DAG(); JobConf conf = new JobConf(); conf.setInputFormat(TextInputFormat.class); TextInputFormat.addInputPath(conf, new Path(inputPath)); Vertex reader = dag.newVertex("reader", readHdfsP(conf, Util::entry)); Vertex mapper = dag.newVertex("mapper", mapP((Map.Entry<LongWritable, Text> e) -> entry(e.getKey().get(), e.getValue().toString()))); Vertex writer = dag.newVertex("writer", writeMapP(name)); reader.localParallelism(parallelism); mapper.localParallelism(parallelism); writer.localParallelism(parallelism); dag.edge(between(reader, mapper)); dag.edge(between(mapper, writer)); JobConfig jobConfig = new JobConfig(); jobConfig.addClass(HdfsToMap.class); client.newJob(dag, jobConfig).join(); }
/** * @see org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil#addDependencyJars(org.apache.hadoop.mapreduce.Job) */ public static void addDependencyJars(JobConf job) throws IOException { org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil.addHBaseDependencyJars(job); org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil.addDependencyJars( job, // when making changes here, consider also mapreduce.TableMapReduceUtil // pull job classes job.getMapOutputKeyClass(), job.getMapOutputValueClass(), job.getOutputKeyClass(), job.getOutputValueClass(), job.getPartitionerClass(), job.getClass("mapred.input.format.class", TextInputFormat.class, InputFormat.class), job.getClass("mapred.output.format.class", TextOutputFormat.class, OutputFormat.class), job.getCombinerClass()); }
public static void main(String[] args) throws Exception { JobConf conf = new JobConf(WeatherData.class); conf.setJobName("temp"); // Note:- As Mapper's output types are not default so we have to define // the // following properties. conf.setMapOutputKeyClass(Text.class); conf.setMapOutputValueClass(Text.class); conf.setMapperClass(MaxTemperatureMapper.class); conf.setReducerClass(MaxTemperatureReducer.class); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); FileInputFormat.setInputPaths(conf, new Path(args[0])); FileOutputFormat.setOutputPath(conf, new Path(args[1])); JobClient.runJob(conf); }
@Override public void run(String[] args) throws Exception { Flags flags = new Flags(); flags.addWithDefaultValue( "tag_subject_data", "/media/work/datasets(secret)/douban/raw/tag_subject.dat", ""); flags.addWithDefaultValue( "subject_data", "/media/work/datasets(secret)/douban/raw/subject.dat", ""); flags.add("output"); flags.parseAndCheck(args); JobConf job = new JobConf(this.getClass()); job.setJobName("convert-douban-raw-to-posts"); MapReduceHelper.setAllOutputTypes(job, Text.class); MapReduceHelper.setMR( job, DoubanRawMapper.class, DoubanToPostReducer.class); job.setInputFormat(TextInputFormat.class); TextInputFormat.addInputPath( job, new Path(flags.getString("tag_subject_data"))); TextInputFormat.addInputPath( job, new Path(flags.getString("subject_data"))); job.setOutputFormat(SequenceFileOutputFormat.class); SequenceFileOutputFormat.setOutputPath( job, new Path(flags.getString("output"))); JobClient.runJob(job); }
public void configure(JobConf job) { // Set the mapper and reducers job.setMapperClass(ReadDataJob.TestMapper.class); // Make sure this jar is included job.setJarByClass(ReadDataJob.TestMapper.class); // Specify the input and output data formats job.setInputFormat(TextInputFormat.class); job.setOutputFormat(NullOutputFormat.class); // Turn off speculative execution job.setMapSpeculativeExecution(false); job.setReduceSpeculativeExecution(false); // Add the job input path FileInputFormat.addInputPath(job, new Path(this.input_path)); }
public void configure(JobConf job) { // Set the mapper and reducers job.setMapperClass(TestMapper.class); // job.setReducerClass(TestReducer.class); // Set the output types of the mapper and reducer // job.setMapOutputKeyClass(IntWritable.class); // job.setMapOutputValueClass(NullWritable.class); // job.setOutputKeyClass(NullWritable.class); // job.setOutputValueClass(NullWritable.class); // Make sure this jar is included job.setJarByClass(TestMapper.class); // Specify the input and output data formats job.setInputFormat(TextInputFormat.class); job.setOutputFormat(NullOutputFormat.class); // Turn off speculative execution job.setMapSpeculativeExecution(false); job.setReduceSpeculativeExecution(false); // Add the job input path FileInputFormat.addInputPath(job, new Path(this.input_filename)); }
public static void main(String[] args) throws Exception { JobConf conf = new JobConf(WordCountOldAPI.class); conf.setJobName("old wordcount"); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(IntWritable.class); conf.setMapperClass(Map.class); conf.setCombinerClass(Reduce.class); conf.setReducerClass(Reduce.class); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); FileInputFormat.setInputPaths(conf, new Path(args[0])); FileOutputFormat.setOutputPath(conf, new Path(args[1])); JobClient.runJob(conf); }
public void configure(JobConf job) { super.configure(job); //disable the auto increment of the counter. For streaming, no of //processed records could be different(equal or less) than the no of //records input. SkipBadRecords.setAutoIncrMapperProcCount(job, false); skipping = job.getBoolean("mapred.skip.on", false); if (mapInputWriterClass_.getCanonicalName().equals(TextInputWriter.class.getCanonicalName())) { String inputFormatClassName = job.getClass("mapred.input.format.class", TextInputFormat.class).getCanonicalName(); ignoreKey = job.getBoolean("stream.map.input.ignoreKey", inputFormatClassName.equals(TextInputFormat.class.getCanonicalName())); } try { mapOutputFieldSeparator = job.get("stream.map.output.field.separator", "\t").getBytes("UTF-8"); mapInputFieldSeparator = job.get("stream.map.input.field.separator", "\t").getBytes("UTF-8"); numOfMapOutputKeyFields = job.getInt("stream.num.map.output.key.fields", 1); } catch (UnsupportedEncodingException e) { throw new RuntimeException("The current system does not support UTF-8 encoding!", e); } }
private static JobConf createJobConf(Configuration conf) throws IOException { JobConf jobConf = new JobConf(conf); String jobName = "transaction_generator"; jobConf.setJobName(jobName); String splitDir = workplace + "split/"; jobConf.set(TEST_DIR_LABEL, workplace); jobConf.setMapSpeculativeExecution(false); jobConf.setJarByClass(TxnGenerator.class); jobConf.setMapperClass(GeneratorMapper.class); jobConf.setInputFormat(TextInputFormat.class); FileInputFormat.addInputPath(jobConf, new Path(splitDir)); Random random = new Random(); FileOutputFormat.setOutputPath(jobConf, new Path(workplace, "output" + random.nextLong())); jobConf.setNumReduceTasks(0); jobConf.setNumMapTasks(numMappers); createSplitFiles(conf, new Path(splitDir)); return jobConf; }
@SuppressWarnings("unchecked") public void configure(JobConf job) { super.configure(job); //disable the auto increment of the counter. For streaming, no of //processed records could be different(equal or less) than the no of //records input. SkipBadRecords.setAutoIncrMapperProcCount(job, false); skipping = job.getBoolean("mapred.skip.on", false); String inputFormatClassName = job.getClass("mapred.input.format.class", TextInputFormat.class).getCanonicalName(); ignoreKey = ignoreKey || inputFormatClassName.equals(TextInputFormat.class.getCanonicalName()); try { mapOutputFieldSeparator = job.get("stream.map.output.field.separator", "\t").getBytes("UTF-8"); mapInputFieldSeparator = job.get("stream.map.input.field.separator", "\t").getBytes("UTF-8"); numOfMapOutputKeyFields = job.getInt("stream.num.map.output.key.fields", 1); } catch (UnsupportedEncodingException e) { throw new RuntimeException("The current system does not support UTF-8 encoding!", e); } }
public void run(String[] args) throws Exception { JobConf conf = new JobConf(this.getClass()); conf.setJobName("wordcount"); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(IntWritable.class); conf.setMapperClass(Map.class); conf.setCombinerClass(Reduce.class); conf.setReducerClass(Reduce.class); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); FileInputFormat.setInputPaths(conf, new Path(args[0])); FileOutputFormat.setOutputPath(conf, new Path(args[1])); JobClient.runJob(conf); }
public CSVReadTask(InputSplit split, SplitOffsetInfos offsets, TextInputFormat informat, JobConf job, MatrixBlock dest, long rlen, long clen, boolean hasHeader, String delim, boolean fill, double fillValue, int splitCount) { _split = split; _splitoffsets = offsets; // new SplitOffsetInfos(offsets); _sparse = dest.isInSparseFormat(); _informat = informat; _job = job; _dest = dest; _rlen = rlen; _clen = clen; _isFirstSplit = (splitCount == 0); _hasHeader = hasHeader; _fill = fill; _fillValue = fillValue; _delim = delim; _rc = true; _splitCount = splitCount; }
@Test public void testAddInputPathWithMapper() { final JobConf conf = new JobConf(); MultipleInputs.addInputPath(conf, new Path("/foo"), TextInputFormat.class, MapClass.class); MultipleInputs.addInputPath(conf, new Path("/bar"), KeyValueTextInputFormat.class, MapClass2.class); final Map<Path, InputFormat> inputs = MultipleInputs .getInputFormatMap(conf); final Map<Path, Class<? extends Mapper>> maps = MultipleInputs .getMapperTypeMap(conf); assertEquals(TextInputFormat.class, inputs.get(new Path("/foo")).getClass()); assertEquals(KeyValueTextInputFormat.class, inputs.get(new Path("/bar")) .getClass()); assertEquals(MapClass.class, maps.get(new Path("/foo"))); assertEquals(MapClass2.class, maps.get(new Path("/bar"))); }
protected void readTextCellFrameFromHDFS( Path path, JobConf job, FileSystem fs, FrameBlock dest, ValueType[] schema, String[] names, long rlen, long clen) throws IOException { if( fs.isDirectory(path) ) { FileInputFormat.addInputPath(job, path); TextInputFormat informat = new TextInputFormat(); informat.configure(job); InputSplit[] splits = informat.getSplits(job, 1); for(InputSplit split: splits) readTextCellFrameFromInputSplit(split, informat, job, dest); } else { readRawTextCellFrameFromHDFS(path, job, fs, dest, schema, names, rlen, clen); } }
static InputFormat<?, ?> getInputFormat(Configuration configuration, Properties schema, boolean symlinkTarget) { String inputFormatName = getInputFormatName(schema); try { JobConf jobConf = new JobConf(configuration); Class<? extends InputFormat<?, ?>> inputFormatClass = getInputFormatClass(jobConf, inputFormatName); if (symlinkTarget && (inputFormatClass == SymlinkTextInputFormat.class)) { // symlink targets are always TextInputFormat inputFormatClass = TextInputFormat.class; } return ReflectionUtils.newInstance(inputFormatClass, jobConf); } catch (ClassNotFoundException | RuntimeException e) { throw new RuntimeException("Unable to create input format " + inputFormatName, e); } }
public static Table createUnpartitionedTable( HiveMetaStoreClient metaStoreClient, String database, String table, URI location) throws TException { Table hiveTable = new Table(); hiveTable.setDbName(database); hiveTable.setTableName(table); hiveTable.setTableType(TableType.EXTERNAL_TABLE.name()); hiveTable.putToParameters("EXTERNAL", "TRUE"); StorageDescriptor sd = new StorageDescriptor(); sd.setCols(DATA_COLUMNS); sd.setLocation(location.toString()); sd.setParameters(new HashMap<String, String>()); sd.setInputFormat(TextInputFormat.class.getName()); sd.setOutputFormat(TextOutputFormat.class.getName()); sd.setSerdeInfo(new SerDeInfo()); sd.getSerdeInfo().setSerializationLib("org.apache.hadoop.hive.serde2.OpenCSVSerde"); hiveTable.setSd(sd); metaStoreClient.createTable(hiveTable); ColumnStatisticsDesc statsDesc = new ColumnStatisticsDesc(true, database, table); ColumnStatisticsData statsData = new ColumnStatisticsData(_Fields.LONG_STATS, new LongColumnStatsData(1L, 2L)); ColumnStatisticsObj cso1 = new ColumnStatisticsObj("id", "bigint", statsData); List<ColumnStatisticsObj> statsObj = Collections.singletonList(cso1); metaStoreClient.updateTableColumnStatistics(new ColumnStatistics(statsDesc, statsObj)); return hiveTable; }
public static Table createPartitionedTable( HiveMetaStoreClient metaStoreClient, String database, String table, URI location) throws Exception { Table hiveTable = new Table(); hiveTable.setDbName(database); hiveTable.setTableName(table); hiveTable.setTableType(TableType.EXTERNAL_TABLE.name()); hiveTable.putToParameters("EXTERNAL", "TRUE"); hiveTable.setPartitionKeys(PARTITION_COLUMNS); StorageDescriptor sd = new StorageDescriptor(); sd.setCols(DATA_COLUMNS); sd.setLocation(location.toString()); sd.setParameters(new HashMap<String, String>()); sd.setInputFormat(TextInputFormat.class.getName()); sd.setOutputFormat(TextOutputFormat.class.getName()); sd.setSerdeInfo(new SerDeInfo()); sd.getSerdeInfo().setSerializationLib("org.apache.hadoop.hive.serde2.OpenCSVSerde"); hiveTable.setSd(sd); metaStoreClient.createTable(hiveTable); ColumnStatisticsDesc statsDesc = new ColumnStatisticsDesc(true, database, table); ColumnStatisticsData statsData = new ColumnStatisticsData(_Fields.LONG_STATS, new LongColumnStatsData(1L, 2L)); ColumnStatisticsObj cso1 = new ColumnStatisticsObj("id", "bigint", statsData); List<ColumnStatisticsObj> statsObj = Collections.singletonList(cso1); metaStoreClient.updateTableColumnStatistics(new ColumnStatistics(statsDesc, statsObj)); return hiveTable; }
public void testAddInputPathWithFormat() { final JobConf conf = new JobConf(); MultipleInputs.addInputPath(conf, new Path("/foo"), TextInputFormat.class); MultipleInputs.addInputPath(conf, new Path("/bar"), KeyValueTextInputFormat.class); final Map<Path, InputFormat> inputs = MultipleInputs .getInputFormatMap(conf); assertEquals(TextInputFormat.class, inputs.get(new Path("/foo")).getClass()); assertEquals(KeyValueTextInputFormat.class, inputs.get(new Path("/bar")) .getClass()); }
public void configure(String keySpec, int expect) throws Exception { Path testdir = new Path(TEST_DIR.getAbsolutePath()); Path inDir = new Path(testdir, "in"); Path outDir = new Path(testdir, "out"); FileSystem fs = getFileSystem(); fs.delete(testdir, true); conf.setInputFormat(TextInputFormat.class); FileInputFormat.setInputPaths(conf, inDir); FileOutputFormat.setOutputPath(conf, outDir); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(LongWritable.class); conf.setNumMapTasks(1); conf.setNumReduceTasks(1); conf.setOutputFormat(TextOutputFormat.class); conf.setOutputKeyComparatorClass(KeyFieldBasedComparator.class); conf.setKeyFieldComparatorOptions(keySpec); conf.setKeyFieldPartitionerOptions("-k1.1,1.1"); conf.set(JobContext.MAP_OUTPUT_KEY_FIELD_SEPERATOR, " "); conf.setMapperClass(InverseMapper.class); conf.setReducerClass(IdentityReducer.class); if (!fs.mkdirs(testdir)) { throw new IOException("Mkdirs failed to create " + testdir.toString()); } if (!fs.mkdirs(inDir)) { throw new IOException("Mkdirs failed to create " + inDir.toString()); } // set up input data in 2 files Path inFile = new Path(inDir, "part0"); FileOutputStream fos = new FileOutputStream(inFile.toString()); fos.write((line1 + "\n").getBytes()); fos.write((line2 + "\n").getBytes()); fos.close(); JobClient jc = new JobClient(conf); RunningJob r_job = jc.submitJob(conf); while (!r_job.isComplete()) { Thread.sleep(1000); } if (!r_job.isSuccessful()) { fail("Oops! The job broke due to an unexpected error"); } Path[] outputFiles = FileUtil.stat2Paths( getFileSystem().listStatus(outDir, new Utils.OutputFileUtils.OutputFilesFilter())); if (outputFiles.length > 0) { InputStream is = getFileSystem().open(outputFiles[0]); BufferedReader reader = new BufferedReader(new InputStreamReader(is)); String line = reader.readLine(); //make sure we get what we expect as the first line, and also //that we have two lines if (expect == 1) { assertTrue(line.startsWith(line1)); } else if (expect == 2) { assertTrue(line.startsWith(line2)); } line = reader.readLine(); if (expect == 1) { assertTrue(line.startsWith(line2)); } else if (expect == 2) { assertTrue(line.startsWith(line1)); } reader.close(); } }
static boolean runJob(JobConf conf, Path inDir, Path outDir, int numMaps, int numReds) throws IOException, InterruptedException { FileSystem fs = FileSystem.get(conf); if (fs.exists(outDir)) { fs.delete(outDir, true); } if (!fs.exists(inDir)) { fs.mkdirs(inDir); } String input = "The quick brown fox\n" + "has many silly\n" + "red fox sox\n"; for (int i = 0; i < numMaps; ++i) { DataOutputStream file = fs.create(new Path(inDir, "part-" + i)); file.writeBytes(input); file.close(); } DistributedCache.addFileToClassPath(TestMRJobs.APP_JAR, conf, fs); conf.setOutputCommitter(CustomOutputCommitter.class); conf.setInputFormat(TextInputFormat.class); conf.setOutputKeyClass(LongWritable.class); conf.setOutputValueClass(Text.class); FileInputFormat.setInputPaths(conf, inDir); FileOutputFormat.setOutputPath(conf, outDir); conf.setNumMapTasks(numMaps); conf.setNumReduceTasks(numReds); JobClient jobClient = new JobClient(conf); RunningJob job = jobClient.submitJob(conf); return jobClient.monitorAndPrintJob(conf, job); }
@Test public void testCombinerShouldUpdateTheReporter() throws Exception { JobConf conf = new JobConf(mrCluster.getConfig()); int numMaps = 5; int numReds = 2; Path in = new Path(mrCluster.getTestWorkDir().getAbsolutePath(), "testCombinerShouldUpdateTheReporter-in"); Path out = new Path(mrCluster.getTestWorkDir().getAbsolutePath(), "testCombinerShouldUpdateTheReporter-out"); createInputOutPutFolder(in, out, numMaps); conf.setJobName("test-job-with-combiner"); conf.setMapperClass(IdentityMapper.class); conf.setCombinerClass(MyCombinerToCheckReporter.class); //conf.setJarByClass(MyCombinerToCheckReporter.class); conf.setReducerClass(IdentityReducer.class); DistributedCache.addFileToClassPath(TestMRJobs.APP_JAR, conf); conf.setOutputCommitter(CustomOutputCommitter.class); conf.setInputFormat(TextInputFormat.class); conf.setOutputKeyClass(LongWritable.class); conf.setOutputValueClass(Text.class); FileInputFormat.setInputPaths(conf, in); FileOutputFormat.setOutputPath(conf, out); conf.setNumMapTasks(numMaps); conf.setNumReduceTasks(numReds); runJob(conf); }
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException { // Delegate the generation of input splits to the 'original' InputFormat return ReflectionUtils.newInstance( job.getClass(Submitter.INPUT_FORMAT, TextInputFormat.class, InputFormat.class), job).getSplits(job, numSplits); }
public void configure(JobConf job) { this.fieldSeparator = job.get(FieldSelectionHelper.DATA_FIELD_SEPERATOR, "\t"); this.mapOutputKeyValueSpec = job.get( FieldSelectionHelper.MAP_OUTPUT_KEY_VALUE_SPEC, "0-:"); this.ignoreInputKey = TextInputFormat.class.getCanonicalName().equals( job.getInputFormat().getClass().getCanonicalName()); this.reduceOutputKeyValueSpec = job.get( FieldSelectionHelper.REDUCE_OUTPUT_KEY_VALUE_SPEC, "0-:"); parseOutputKeyValueSpec(); LOG.info(specToString()); }
@Test public void testDefaultToIdentityReducer() throws Exception { args.add("-mapper");args.add(map); args.add("-jobconf"); args.add("mapreduce.task.files.preserve.failedtasks=true"); args.add("-jobconf"); args.add("stream.tmpdir="+System.getProperty("test.build.data","/tmp")); args.add("-inputformat");args.add(TextInputFormat.class.getName()); super.testCommandLine(); }