Java class org.apache.hadoop.mapreduce.lib.input.FileInputFormat: example source code
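
The snippets below all follow the same driver pattern: configure a Job, point FileInputFormat at one or more input paths, and point FileOutputFormat at an output path. As a minimal sketch of that pattern (an identity mapper/reducer job; the class name MinimalDriver and the argument handling are placeholders, not taken from any of the projects below):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class MinimalDriver {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "minimal FileInputFormat example");
    job.setJarByClass(MinimalDriver.class);

    // Identity mapper and reducer: with TextInputFormat the keys are byte
    // offsets (LongWritable) and the values are the input lines (Text).
    job.setMapperClass(Mapper.class);
    job.setReducerClass(Reducer.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    // addInputPath appends to the list of input paths; setInputPaths replaces it.
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}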

Project: aliyun-maxcompute-data-collectors    File: TestImportJob.java
@Override
public void configureInputFormat(Job job, String tableName,
    String tableClassName, String splitByCol)
    throws ClassNotFoundException, IOException {

  // Write a line of text into a file so that we can get
  // a record to the map task.
  Path dir = new Path(this.options.getTempDir());
  Path p = new Path(dir, "sqoop-dummy-import-job-file.txt");
  FileSystem fs = FileSystem.getLocal(this.options.getConf());
  if (fs.exists(p)) {
    boolean result = fs.delete(p, false);
    assertTrue("Couldn't delete temp file!", result);
  }

  BufferedWriter w = new BufferedWriter(
      new OutputStreamWriter(fs.create(p)));
  w.append("This is a line!");
  w.close();

  FileInputFormat.addInputPath(job, p);

  // And set the InputFormat itself.
  super.configureInputFormat(job, tableName, tableClassName, splitByCol);
}
Project: LDA    File: CalParamDriver.java
public static void run(Configuration conf, Path inputPath, Path output, double params) throws IOException, ClassNotFoundException, InterruptedException {
    String jobName = "calculating parameter";
    conf.set("params",String.valueOf(params));

    Job job = new Job(conf, jobName);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(indexToCountWritable.class);
    job.setOutputKeyClass(twoDimensionIndexWritable.class);
    job.setOutputValueClass(Text.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapperClass(CalParamsMapper.class);
    job.setReducerClass(CalParamsReducer.class);

    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job,output);

    job.setJarByClass(LDADriver.class);
    if (!job.waitForCompletion(true)) {
        throw new InterruptedException("calculating parameter failed");
    }
}
Project: DocIT    File: Cut.java
public static void cut(String name, String in) throws Exception {
    Job job = createJob(name, in);
    job.setNumReduceTasks(0);
    boolean success = job.waitForCompletion(true);

    if (success) {
        // MapReduce reads an input and writes the result to a new location.
        // Hence it cannot modify data in place, and we have to replace the
        // input with the output.
        Path output = FileOutputFormat.getOutputPath(job);
        FileSystem fs = output.getFileSystem(job.getConfiguration());
        Path[] inputs = FileInputFormat.getInputPaths(job);
        for (int i = 0; i < inputs.length; i++) {
            fs.delete(inputs[i], true);
        }
        fs.rename(output, inputs[0]);
    }
}
Project: Wikipedia-Index    File: TF.java
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    conf.set("xmlinput.start", "<page>");
    conf.set("xmlinput.end", "</page>");

    Job job = Job.getInstance(conf);
    job.setJobName("TermFrequencyCount");
    job.setJarByClass(TF.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntArrayWritable.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(DoubleWritable.class);

    job.setMapperClass(TFMap.class);
    job.setReducerClass(TFReduce.class);

    job.setInputFormatClass(XmlInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    boolean wait = job.waitForCompletion(true);
    System.exit(wait ? 0 : 1);
}
Project: Wikipedia-Index    File: ExtractPage.java
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    conf.set("xmlinput.start", "<page>");
    conf.set("xmlinput.end", "</page>");

    Job job = Job.getInstance(conf);
    job.setJobName("ExtractPages");
    job.setJarByClass(ExtractPage.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(ExtractPageMap.class);
    job.setReducerClass(ExtractPageReduce.class);

    job.setInputFormatClass(XmlInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    boolean wait = job.waitForCompletion(true);
    System.exit(wait ? 0 : 1);
}
Project: Wikipedia-Index    File: DF.java
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();

    Job job = Job.getInstance(conf);
    job.setJobName("DocumentFrequencyCount");
    job.setJarByClass(DF.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(DFMap.class);
    job.setReducerClass(DFReduce.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    boolean wait = job.waitForCompletion(true);
    System.exit(wait ? 0 : 1);
}
Project: Wikipedia-Index    File: MaxThreeLabel.java
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();

    Job job = Job.getInstance(conf);
    job.setJobName("MaxThreeLabel");
    job.setJarByClass(MaxThreeLabel.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(TextArrayWritable.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(MaxThreeLabelMap.class);
    job.setReducerClass(MaxThreeLabelReduce.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    boolean wait = job.waitForCompletion(true);
    System.exit(wait ? 0 : 1);
}
Project: Wikipedia-Index    File: PageWordCount.java
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    conf.set("xmlinput.start", "<page>");
    conf.set("xmlinput.end", "</page>");

    Job job = Job.getInstance(conf);
    job.setJobName("PageWordCount");
    job.setJarByClass(PageWordCount.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(PageWordCountMap.class);
    job.setCombinerClass(PageWordCountReduce.class);
    job.setReducerClass(PageWordCountReduce.class);

    job.setInputFormatClass(XmlInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.waitForCompletion(true);
}
Project: Wikipedia-Index    File: TF_IDF.java
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();

    Job job = Job.getInstance(conf);
    job.setJobName("TF-IDFCount");
    job.setJarByClass(TF_IDF.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(TextArrayWritable.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(DoubleWritable.class);

    job.setMapperClass(TF_IDFMap.class);
    job.setReducerClass(TF_IDFReduce.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileInputFormat.addInputPath(job, new Path(args[1]));
    FileOutputFormat.setOutputPath(job, new Path(args[2]));
    boolean wait = job.waitForCompletion(true);
    System.exit(wait ? 0 : 1);
}
Project: HBase-High-Performance-Cookbook    File: InputDriver.java
public static void runJob(Path input, Path output, String vectorClassName,Configuration config)
  throws IOException, InterruptedException, ClassNotFoundException {
  Configuration conf = config;
  conf.set("vector.implementation.class.name", vectorClassName);
  Job job = new Job(conf, "Input Driver running over input: " + input);

  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(VectorWritable.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setMapperClass(InputMapper.class);   
  job.setNumReduceTasks(0);
  job.setJarByClass(InputDriver.class);

  FileInputFormat.addInputPath(job, input);
  FileOutputFormat.setOutputPath(job, output);

  job.waitForCompletion(true);
}
Project: Hadoop-Codes    File: MaximumAverageDriver.java
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "maxaverage");

    job.setMapperClass(MaximumAverageMapper.class);
    job.setReducerClass(MaximumAverageReducer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(FloatWritable.class);

    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    if (!job.waitForCompletion(true))
        return;
}
Project: Hadoop-Codes    File: testDriver.java
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "test");

    job.setMapperClass(testMapper.class);
    job.setPartitionerClass(testPartitioner.class);
    job.setReducerClass(testReducer.class);
    job.setNumReduceTasks(10);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    if (!job.waitForCompletion(true))
        return;
}
Project: Hadoop-Codes    File: SystemUserDriver.java
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "systemuser");

    //job.setMapperClass(SystemUserMapper.class);
    job.setMapperClass(DailyCount.class);
    job.setReducerClass(IntSumReducer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    if (!job.waitForCompletion(true))
        return;
}
Project: Hadoop-Codes    File: MaxTempDriver.java
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "maxtemp");

    job.setMapperClass(MaxTempMapper.class);
    job.setReducerClass(MaxTempReducer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(FloatWritable.class);

    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    if (!job.waitForCompletion(true))
        return;
}
Project: aliyun-maxcompute-data-collectors    File: ExportJobBase.java
@Override
protected void configureInputFormat(Job job, String tableName,
    String tableClassName, String splitByCol)
    throws ClassNotFoundException, IOException {

  if (options.getOdpsTable() != null) {
    Configuration conf = job.getConfiguration();
    setInputFormatClass(OdpsExportInputFormat.class);
    conf.set(OdpsConstants.TABLE_NAME, options.getOdpsTable());
    conf.set(OdpsConstants.ACCESS_ID, options.getOdpsAccessID());
    conf.set(OdpsConstants.ACCESS_KEY, options.getOdpsAccessKey());
    conf.set(OdpsConstants.ENDPOINT, options.getOdpsEndPoint());
    conf.set(OdpsConstants.PROJECT, options.getOdpsProject());
    String partitionSpec = options.getOdpsPartitionSpec();
    if (partitionSpec != null) {
      conf.set(OdpsConstants.PARTITION_SPEC, partitionSpec);
    }
    setMapperClass(OdpsExportMapper.class);
  }
  super.configureInputFormat(job, tableName, tableClassName, splitByCol);
  if (!isHCatJob && options.getOdpsTable() == null) {
    FileInputFormat.addInputPath(job, getInputPath());
  }

}
Project: hadoop    File: CredentialsTestJob.java
public Job createJob() 
throws IOException {
  Configuration conf = getConf();
  conf.setInt(MRJobConfig.NUM_MAPS, 1);
  Job job = Job.getInstance(conf, "test");
  job.setNumReduceTasks(1);
  job.setJarByClass(CredentialsTestJob.class);
  job.setNumReduceTasks(1);
  job.setMapperClass(CredentialsTestJob.CredentialsTestMapper.class);
  job.setMapOutputKeyClass(IntWritable.class);
  job.setMapOutputValueClass(NullWritable.class);
  job.setReducerClass(CredentialsTestJob.CredentialsTestReducer.class);
  job.setInputFormatClass(SleepJob.SleepInputFormat.class);
  job.setPartitionerClass(SleepJob.SleepJobPartitioner.class);
  job.setOutputFormatClass(NullOutputFormat.class);
  job.setSpeculativeExecution(false);
  job.setJobName("test job");
  FileInputFormat.addInputPath(job, new Path("ignored"));
  return job;
}
Project: hadoop    File: FailJob.java
public Job createJob(boolean failMappers, boolean failReducers, Path inputFile) 
    throws IOException {
  Configuration conf = getConf();
  conf.setBoolean(FAIL_MAP, failMappers);
  conf.setBoolean(FAIL_REDUCE, failReducers);
  Job job = Job.getInstance(conf, "fail");
  job.setJarByClass(FailJob.class);
  job.setMapperClass(FailMapper.class);
  job.setMapOutputKeyClass(LongWritable.class);
  job.setMapOutputValueClass(NullWritable.class);
  job.setReducerClass(FailReducer.class);
  job.setOutputFormatClass(NullOutputFormat.class);
  job.setInputFormatClass(TextInputFormat.class);
  job.setSpeculativeExecution(false);
  job.setJobName("Fail job");
  FileInputFormat.addInputPath(job, inputFile);
  return job;
}
Project: hadoop    File: MapReduceTestUtil.java
/**
 * Creates a simple copy job.
 * 
 * @param conf Configuration object
 * @param outdir Output directory.
 * @param indirs Input directories.
 * @return Job initialized for a data copy job.
 * @throws Exception If an error occurs creating job configuration.
 */
public static Job createCopyJob(Configuration conf, Path outdir, 
    Path... indirs) throws Exception {
  conf.setInt(MRJobConfig.NUM_MAPS, 3);
  Job theJob = Job.getInstance(conf);
  theJob.setJobName("DataMoveJob");

  FileInputFormat.setInputPaths(theJob, indirs);
  theJob.setMapperClass(DataCopyMapper.class);
  FileOutputFormat.setOutputPath(theJob, outdir);
  theJob.setOutputKeyClass(Text.class);
  theJob.setOutputValueClass(Text.class);
  theJob.setReducerClass(DataCopyReducer.class);
  theJob.setNumReduceTasks(1);
  return theJob;
}
Project: hadoop    File: MapReduceTestUtil.java
/**
 * Creates a simple fail job.
 * 
 * @param conf Configuration object
 * @param outdir Output directory.
 * @param indirs Input directories.
 * @return Job initialized for a simple fail job.
 * @throws Exception If an error occurs creating job configuration.
 */
public static Job createFailJob(Configuration conf, Path outdir, 
    Path... indirs) throws Exception {
  FileSystem fs = outdir.getFileSystem(conf);
  if (fs.exists(outdir)) {
    fs.delete(outdir, true);
  }
  conf.setInt(MRJobConfig.MAP_MAX_ATTEMPTS, 2);
  Job theJob = Job.getInstance(conf);
  theJob.setJobName("Fail-Job");

  FileInputFormat.setInputPaths(theJob, indirs);
  theJob.setMapperClass(FailMapper.class);
  theJob.setReducerClass(Reducer.class);
  theJob.setNumReduceTasks(0);
  FileOutputFormat.setOutputPath(theJob, outdir);
  theJob.setOutputKeyClass(Text.class);
  theJob.setOutputValueClass(Text.class);
  return theJob;
}
Project: hadoop    File: MapReduceTestUtil.java
public static Job createJob(Configuration conf, Path inDir, Path outDir, 
    int numInputFiles, int numReds, String input) throws IOException {
  Job job = Job.getInstance(conf);
  FileSystem fs = FileSystem.get(conf);
  if (fs.exists(outDir)) {
    fs.delete(outDir, true);
  }
  if (fs.exists(inDir)) {
    fs.delete(inDir, true);
  }
  fs.mkdirs(inDir);
  for (int i = 0; i < numInputFiles; ++i) {
    DataOutputStream file = fs.create(new Path(inDir, "part-" + i));
    file.writeBytes(input);
    file.close();
  }    

  FileInputFormat.setInputPaths(job, inDir);
  FileOutputFormat.setOutputPath(job, outDir);
  job.setNumReduceTasks(numReds);
  return job;
}
Project: hadoop    File: TestMapReduceLazyOutput.java
private static void runTestLazyOutput(Configuration conf, Path output,
    int numReducers, boolean createLazily) 
throws Exception {
  Job job = Job.getInstance(conf, "Test-Lazy-Output");

  FileInputFormat.setInputPaths(job, INPUT);
  FileOutputFormat.setOutputPath(job, output);

  job.setJarByClass(TestMapReduceLazyOutput.class);
  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputKeyClass(LongWritable.class);
  job.setOutputValueClass(Text.class);
  job.setNumReduceTasks(numReducers);

  job.setMapperClass(TestMapper.class);
  job.setReducerClass(TestReducer.class);

  if (createLazily) {
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);
  } else {
    job.setOutputFormatClass(TextOutputFormat.class);
  }
  assertTrue(job.waitForCompletion(true));
}
Project: hadoop    File: SleepJob.java
public Job createJob(int numMapper, int numReducer, 
                     long mapSleepTime, int mapSleepCount, 
                     long reduceSleepTime, int reduceSleepCount) 
    throws IOException {
  Configuration conf = getConf();
  conf.setLong(MAP_SLEEP_TIME, mapSleepTime);
  conf.setLong(REDUCE_SLEEP_TIME, reduceSleepTime);
  conf.setInt(MAP_SLEEP_COUNT, mapSleepCount);
  conf.setInt(REDUCE_SLEEP_COUNT, reduceSleepCount);
  conf.setInt(MRJobConfig.NUM_MAPS, numMapper);
  Job job = Job.getInstance(conf, "sleep");
  job.setNumReduceTasks(numReducer);
  job.setJarByClass(SleepJob.class);
  job.setMapperClass(SleepMapper.class);
  job.setMapOutputKeyClass(IntWritable.class);
  job.setMapOutputValueClass(NullWritable.class);
  job.setReducerClass(SleepReducer.class);
  job.setOutputFormatClass(NullOutputFormat.class);
  job.setInputFormatClass(SleepInputFormat.class);
  job.setPartitionerClass(SleepJobPartitioner.class);
  job.setSpeculativeExecution(false);
  job.setJobName("Sleep job");
  FileInputFormat.addInputPath(job, new Path("ignored"));
  return job;
}
Project: hadoop    File: TestLocalRunner.java
/**
 * Run a test with a misconfigured number of mappers.
 * Expect failure.
 */
@Test
public void testInvalidMultiMapParallelism() throws Exception {
  Job job = Job.getInstance();

  Path inputPath = createMultiMapsInput();
  Path outputPath = getOutputPath();

  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.getLocal(conf);

  if (fs.exists(outputPath)) {
    fs.delete(outputPath, true);
  }

  job.setMapperClass(StressMapper.class);
  job.setReducerClass(CountingReducer.class);
  job.setNumReduceTasks(1);
  LocalJobRunner.setLocalMaxRunningMaps(job, -6);
  FileInputFormat.addInputPath(job, inputPath);
  FileOutputFormat.setOutputPath(job, outputPath);

  boolean success = job.waitForCompletion(true);
  assertFalse("Job succeeded somehow", success);
}
Project: hadoop    File: LocatedFileStatusFetcher.java
/**
 * @param conf configuration for the job
 * @param dirs the initial list of paths
 * @param recursive whether to traverse the paths recursively
 * @param inputFilter inputFilter to apply to the resulting paths
 * @param newApi whether using the mapred or mapreduce API
 * @throws InterruptedException
 * @throws IOException
 */
public LocatedFileStatusFetcher(Configuration conf, Path[] dirs,
    boolean recursive, PathFilter inputFilter, boolean newApi) throws InterruptedException,
    IOException {
  int numThreads = conf.getInt(FileInputFormat.LIST_STATUS_NUM_THREADS,
      FileInputFormat.DEFAULT_LIST_STATUS_NUM_THREADS);
  rawExec = Executors.newFixedThreadPool(
      numThreads,
      new ThreadFactoryBuilder().setDaemon(true)
          .setNameFormat("GetFileInfo #%d").build());
  exec = MoreExecutors.listeningDecorator(rawExec);
  resultQueue = new LinkedBlockingQueue<List<FileStatus>>();
  this.conf = conf;
  this.inputDirs = dirs;
  this.recursive = recursive;
  this.inputFilter = inputFilter;
  this.newApi = newApi;
}
Project: hadoop    File: ControlledJob.java
/**
 * Submit this job to mapred. The state becomes RUNNING if submission 
 * is successful, FAILED otherwise.  
 */
protected synchronized void submit() {
  try {
    Configuration conf = job.getConfiguration();
    if (conf.getBoolean(CREATE_DIR, false)) {
      FileSystem fs = FileSystem.get(conf);
      Path inputPaths[] = FileInputFormat.getInputPaths(job);
      for (int i = 0; i < inputPaths.length; i++) {
        if (!fs.exists(inputPaths[i])) {
          try {
            fs.mkdirs(inputPaths[i]);
          } catch (IOException e) {

          }
        }
      }
    }
    job.submit();
    this.state = State.RUNNING;
  } catch (Exception ioe) {
    LOG.info(getJobName()+" got an error while submitting ",ioe);
    this.state = State.FAILED;
    this.message = StringUtils.stringifyException(ioe);
  }
}
Project: hadoop    File: TestFileInputFormat.java
@Test
public void testListLocatedStatus() throws Exception {
  Configuration conf = getConfiguration();
  conf.setBoolean("fs.test.impl.disable.cache", false);
  conf.setInt(FileInputFormat.LIST_STATUS_NUM_THREADS, numThreads);
  conf.set(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR,
      "test:///a1/a2");
  MockFileSystem mockFs =
      (MockFileSystem) new Path("test:///").getFileSystem(conf);
  Assert.assertEquals("listLocatedStatus already called",
      0, mockFs.numListLocatedStatusCalls);
  JobConf job = new JobConf(conf);
  TextInputFormat fileInputFormat = new TextInputFormat();
  fileInputFormat.configure(job);
  InputSplit[] splits = fileInputFormat.getSplits(job, 1);
  Assert.assertEquals("Input splits are not correct", 2, splits.length);
  Assert.assertEquals("listLocatedStatuss calls",
      1, mockFs.numListLocatedStatusCalls);
  FileSystem.closeAll();
}
Project: hadoop    File: TestFileInputFormat.java
@Test
public void testSplitLocationInfo() throws Exception {
  Configuration conf = getConfiguration();
  conf.set(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR,
      "test:///a1/a2");
  JobConf job = new JobConf(conf);
  TextInputFormat fileInputFormat = new TextInputFormat();
  fileInputFormat.configure(job);
  FileSplit[] splits = (FileSplit[]) fileInputFormat.getSplits(job, 1);
  String[] locations = splits[0].getLocations();
  Assert.assertEquals(2, locations.length);
  SplitLocationInfo[] locationInfo = splits[0].getLocationInfo();
  Assert.assertEquals(2, locationInfo.length);
  SplitLocationInfo localhostInfo = locations[0].equals("localhost") ?
      locationInfo[0] : locationInfo[1];
  SplitLocationInfo otherhostInfo = locations[0].equals("otherhost") ?
      locationInfo[0] : locationInfo[1];
  Assert.assertTrue(localhostInfo.isOnDisk());
  Assert.assertTrue(localhostInfo.isInMemory());
  Assert.assertTrue(otherhostInfo.isOnDisk());
  Assert.assertFalse(otherhostInfo.isInMemory());
}
Project: hadoop    File: TestFileInputFormat.java
@Test
public void testListStatusSimple() throws IOException {
  Configuration conf = new Configuration();
  conf.setInt(FileInputFormat.LIST_STATUS_NUM_THREADS, numThreads);

  List<Path> expectedPaths = org.apache.hadoop.mapreduce.lib.input.TestFileInputFormat
      .configureTestSimple(conf, localFs);

  JobConf jobConf = new JobConf(conf);
  TextInputFormat fif = new TextInputFormat();
  fif.configure(jobConf);
  FileStatus[] statuses = fif.listStatus(jobConf);

  org.apache.hadoop.mapreduce.lib.input.TestFileInputFormat
      .verifyFileStatuses(expectedPaths, Lists.newArrayList(statuses),
          localFs);
}
Project: hadoop    File: TestFileInputFormat.java
@Test
public void testListStatusNestedRecursive() throws IOException {
  Configuration conf = new Configuration();
  conf.setInt(FileInputFormat.LIST_STATUS_NUM_THREADS, numThreads);

  List<Path> expectedPaths = org.apache.hadoop.mapreduce.lib.input.TestFileInputFormat
      .configureTestNestedRecursive(conf, localFs);
  JobConf jobConf = new JobConf(conf);
  TextInputFormat fif = new TextInputFormat();
  fif.configure(jobConf);
  FileStatus[] statuses = fif.listStatus(jobConf);

  org.apache.hadoop.mapreduce.lib.input.TestFileInputFormat
      .verifyFileStatuses(expectedPaths, Lists.newArrayList(statuses),
          localFs);
}
Project: hadoop    File: TestFileInputFormat.java
@Test
public void testListStatusNestedNonRecursive() throws IOException {
  Configuration conf = new Configuration();
  conf.setInt(FileInputFormat.LIST_STATUS_NUM_THREADS, numThreads);

  List<Path> expectedPaths = org.apache.hadoop.mapreduce.lib.input.TestFileInputFormat
      .configureTestNestedNonRecursive(conf, localFs);
  JobConf jobConf = new JobConf(conf);
  TextInputFormat fif = new TextInputFormat();
  fif.configure(jobConf);
  FileStatus[] statuses = fif.listStatus(jobConf);

  org.apache.hadoop.mapreduce.lib.input.TestFileInputFormat
      .verifyFileStatuses(expectedPaths, Lists.newArrayList(statuses),
          localFs);
}
Project: hadoop    File: TestFileInputFormat.java
@Test
public void testListStatusErrorOnNonExistantDir() throws IOException {
  Configuration conf = new Configuration();
  conf.setInt(FileInputFormat.LIST_STATUS_NUM_THREADS, numThreads);

  org.apache.hadoop.mapreduce.lib.input.TestFileInputFormat
      .configureTestErrorOnNonExistantDir(conf, localFs);
  JobConf jobConf = new JobConf(conf);
  TextInputFormat fif = new TextInputFormat();
  fif.configure(jobConf);
  try {
    fif.listStatus(jobConf);
    Assert.fail("Expecting an IOException for a missing Input path");
  } catch (IOException e) {
    Path expectedExceptionPath = new Path(TEST_ROOT_DIR, "input2");
    expectedExceptionPath = localFs.makeQualified(expectedExceptionPath);
    Assert.assertTrue(e instanceof InvalidInputException);
    Assert.assertEquals(
        "Input path does not exist: " + expectedExceptionPath.toString(),
        e.getMessage());
  }
}
Project: hadoop    File: WordStandardDeviation.java
@Override
public int run(String[] args) throws Exception {
  if (args.length != 2) {
    System.err.println("Usage: wordstddev <in> <out>");
    return 0;
  }

  Configuration conf = getConf();

  Job job = Job.getInstance(conf, "word stddev");
  job.setJarByClass(WordStandardDeviation.class);
  job.setMapperClass(WordStandardDeviationMapper.class);
  job.setCombinerClass(WordStandardDeviationReducer.class);
  job.setReducerClass(WordStandardDeviationReducer.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(LongWritable.class);
  FileInputFormat.addInputPath(job, new Path(args[0]));
  Path outputpath = new Path(args[1]);
  FileOutputFormat.setOutputPath(job, outputpath);
  boolean result = job.waitForCompletion(true);

  // read output and calculate standard deviation
  stddev = readAndCalcStdDev(outputpath, conf);

  return (result ? 0 : 1);
}
Project: hadoop    File: WordCount.java
public static void main(String[] args) throws Exception {
  Configuration conf = new Configuration();
  String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
  if (otherArgs.length < 2) {
    System.err.println("Usage: wordcount <in> [<in>...] <out>");
    System.exit(2);
  }
  Job job = Job.getInstance(conf, "word count");
  job.setJarByClass(WordCount.class);
  job.setMapperClass(TokenizerMapper.class);
  job.setCombinerClass(IntSumReducer.class);
  job.setReducerClass(IntSumReducer.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(IntWritable.class);
  for (int i = 0; i < otherArgs.length - 1; ++i) {
    FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
  }
  FileOutputFormat.setOutputPath(job,
    new Path(otherArgs[otherArgs.length - 1]));
  System.exit(job.waitForCompletion(true) ? 0 : 1);
}
Project: hadoop    File: WordMean.java
@Override
public int run(String[] args) throws Exception {
  if (args.length != 2) {
    System.err.println("Usage: wordmean <in> <out>");
    return 0;
  }

  Configuration conf = getConf();

  Job job = Job.getInstance(conf, "word mean");
  job.setJarByClass(WordMean.class);
  job.setMapperClass(WordMeanMapper.class);
  job.setCombinerClass(WordMeanReducer.class);
  job.setReducerClass(WordMeanReducer.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(LongWritable.class);
  FileInputFormat.addInputPath(job, new Path(args[0]));
  Path outputpath = new Path(args[1]);
  FileOutputFormat.setOutputPath(job, outputpath);
  boolean result = job.waitForCompletion(true);
  mean = readAndCalcMean(outputpath, conf);

  return (result ? 0 : 1);
}
Project: hadoop    File: TeraValidate.java
public int run(String[] args) throws Exception {
  Job job = Job.getInstance(getConf());
  if (args.length != 2) {
    usage();
    return 1;
  }
  TeraInputFormat.setInputPaths(job, new Path(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));
  job.setJobName("TeraValidate");
  job.setJarByClass(TeraValidate.class);
  job.setMapperClass(ValidateMapper.class);
  job.setReducerClass(ValidateReducer.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  // force a single reducer
  job.setNumReduceTasks(1);
  // force a single split 
  FileInputFormat.setMinInputSplitSize(job, Long.MAX_VALUE);
  job.setInputFormatClass(TeraInputFormat.class);
  return job.waitForCompletion(true) ? 0 : 1;
}
Project: hadoop    File: CompressionEmulationUtil.java
/**
 * Configure the {@link Job} for enabling compression emulation.
 */
static void configure(final Job job) throws IOException, InterruptedException,
                                            ClassNotFoundException {
  // set the random text mapper
  job.setMapperClass(RandomTextDataMapper.class);
  job.setNumReduceTasks(0);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(Text.class);
  job.setInputFormatClass(GenDataFormat.class);
  job.setJarByClass(GenerateData.class);

  // set the output compression true
  FileOutputFormat.setCompressOutput(job, true);
  try {
    FileInputFormat.addInputPath(job, new Path("ignored"));
  } catch (IOException e) {
    LOG.error("Error while adding input path ", e);
  }
}
Project: hadoop    File: GenerateDistCacheData.java
@Override
public Job call() throws IOException, InterruptedException,
                         ClassNotFoundException {
  UserGroupInformation ugi = UserGroupInformation.getLoginUser();
  ugi.doAs( new PrivilegedExceptionAction <Job>() {
     public Job run() throws IOException, ClassNotFoundException,
                             InterruptedException {
      job.setMapperClass(GenDCDataMapper.class);
      job.setNumReduceTasks(0);
      job.setMapOutputKeyClass(NullWritable.class);
      job.setMapOutputValueClass(BytesWritable.class);
      job.setInputFormatClass(GenDCDataFormat.class);
      job.setOutputFormatClass(NullOutputFormat.class);
      job.setJarByClass(GenerateDistCacheData.class);
      try {
        FileInputFormat.addInputPath(job, new Path("ignored"));
      } catch (IOException e) {
        LOG.error("Error while adding input path ", e);
      }
      job.submit();
      return job;
    }
  });
  return job;
}
Project: learn-to-hadoop    File: MaxTemperature.java
public static void main(String[] args) throws Exception {
    if(args.length != 2){
        System.err.println("Usage: MaxTemperature <input path> <output path>");
        System.exit(-1);
    }

    Job job = new Job();
    job.setJarByClass(MaxTemperature.class);
    job.setJobName("Max temperature");

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(MaxTemperatureMapper.class);
    job.setReducerClass(MaxTemperatureReducer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
Project: learn-to-hadoop    File: MaxTemperatureWithCombiner.java
public static void main(String[] args) throws Exception {
    if(args.length != 2){
        System.err.println("Usage: MaxTemperatureWithCombiner <input path> <output path>");
        System.exit(-1);
    }

    Job job = new Job();
    job.setJarByClass(MaxTemperatureWithCombiner.class);
    job.setJobName("Max Temperature With Combiner");

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(MaxTemperatureMapper.class);
    job.setCombinerClass(MaxTemperatureReducer.class);
    job.setReducerClass(MaxTemperatureReducer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
Project: LDA    File: InputDriver.java
public static void runJob(Configuration conf, Path inputPath, Path output) throws IOException, ClassNotFoundException, InterruptedException {

        Job job = new Job(conf, "Input Drive running input:"+inputPath);
        log.info("start running InputDriver");
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(indexToWordWritable.class);
        job.setOutputKeyClass(twoDimensionIndexWritable.class);
        job.setOutputValueClass(Text.class);

        job.setMapperClass(InputMapper.class);
        job.setReducerClass(InputReducer.class);
        job.setNumReduceTasks(1);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        job.setJarByClass(InputDriver.class);

        FileInputFormat.addInputPath(job, inputPath);
        FileOutputFormat.setOutputPath(job, output);

        boolean succeeded = job.waitForCompletion(true);
        if (!succeeded) {
            throw new IllegalStateException("Job failed!");
        }

    }