public static void main(String[] args) throws Exception { if (args.length != 2) { System.err.println("Usage: OutLinks <input path> <output path>"); System.exit(-1); } Job job = new Job(); job.setJarByClass(OutLinks.class); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); job.setOutputFormatClass(MapFileOutputFormat.class); job.setMapperClass(OutLinkNumMapper.class); job.setReducerClass(OutLinkNumReducer.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(DoubleWritable.class); job.waitForCompletion(true); }
public static void main(String[] args) throws Exception { if(args.length != 2) { System.err.println("Usage: UrlModulus <input path> <output path>"); System.exit(-1); } Configuration conf = new Configuration(); Job job = new Job(conf, "UrlModulus"); job.setJarByClass(UrlModulus.class); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); job.setOutputFormatClass(MapFileOutputFormat.class); job.setMapperClass(UrlModulusMapper.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(DoubleWritable.class); job.setReducerClass(UrlModulusReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(DoubleWritable.class); job.setNumReduceTasks(1); job.waitForCompletion(true); }
public static void main(String[] args) throws Exception { if (args.length != 2) { System.err.println("Usage: OutLinks <input path> <output path>"); System.exit(-1); } Job job = new Job(); job.setJarByClass(PageRank.class); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); job.setOutputFormatClass(MapFileOutputFormat.class); job.setMapperClass(PageRankMapper.class); job.setReducerClass(PageRankReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(DoubleWritable.class); PageRankRead = new MapFileRead("/input/PageRankMap"); //OutLinksRead = new MapFileRead("/input/OutLinksMap"); job.waitForCompletion(true); }
private int getNumPages(Configuration conf, Path titlesDir) throws Exception {
    int numPages = 0;
    IntWritable pageNumber = new IntWritable();
    MapFile.Reader[] readers = MapFileOutputFormat.getReaders(titlesDir, conf);
    for (int i = 0; i < readers.length; i++) {
        readers[i].finalKey(pageNumber);
        if (pageNumber.get() > numPages) {
            numPages = pageNumber.get();
        }
    }
    for (MapFile.Reader reader : readers) {
        reader.close();
    }
    return numPages;
}
@Override public int run(String[] args) throws Exception { if (args.length != 2) { JobBuilder.printUsage(this, "<path> <key>"); return -1; } Path path = new Path(args[0]); IntWritable key = new IntWritable(Integer.parseInt(args[1])); Reader[] readers = MapFileOutputFormat.getReaders(path, getConf()); Partitioner<IntWritable, Text> partitioner = new HashPartitioner<IntWritable, Text>(); Text val = new Text(); Writable entry = MapFileOutputFormat.getEntry(readers, partitioner, key, val); if (entry == null) { System.err.println("Key not found: " + key); return -1; } NcdcRecordParser parser = new NcdcRecordParser(); parser.parse(val.toString()); System.out.printf("%s\t%s\n", parser.getStationId(), parser.getYear()); return 0; }
@Override
public int run(String[] args) throws Exception {
    Job job = JobBuilder.parseInputAndOutput(this, getConf(), args);
    if (job == null) {
        return -1;
    }
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputFormatClass(MapFileOutputFormat.class);

    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    return job.waitForCompletion(true) ? 0 : 1;
}
public Job createSubmittableJob(String[] args) throws IOException {
    Path partitionsPath = new Path(destPath, PARTITIONS_FILE_NAME);
    generatePartitions(partitionsPath);

    Job job = Job.getInstance(getConf(),
        getConf().get("mapreduce.job.name", "hashTable_" + tableHash.tableName));
    Configuration jobConf = job.getConfiguration();
    jobConf.setLong(HASH_BATCH_SIZE_CONF_KEY, tableHash.batchSize);
    job.setJarByClass(HashTable.class);

    TableMapReduceUtil.initTableMapperJob(tableHash.tableName, tableHash.initScan(),
        HashMapper.class, ImmutableBytesWritable.class, ImmutableBytesWritable.class, job);

    // use a TotalOrderPartitioner and reducers to group region output into hash files
    job.setPartitionerClass(TotalOrderPartitioner.class);
    TotalOrderPartitioner.setPartitionFile(jobConf, partitionsPath);
    job.setReducerClass(Reducer.class); // identity reducer
    job.setNumReduceTasks(tableHash.numHashFiles);

    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(ImmutableBytesWritable.class);
    job.setOutputFormatClass(MapFileOutputFormat.class);
    FileOutputFormat.setOutputPath(job, new Path(destPath, HASH_DATA_DIR));

    return job;
}
public int run(String[] args) throws Exception { if (args.length != 2) { System.out.println("Usage: TitleIndex <titles-sorted.txt> <output-dir>"); ToolRunner.printGenericCommandUsage(System.out); return 2; } Path titlesFile = new Path(args[0]); Path outputDir = new Path(args[1]); Configuration conf = getConf(); // Do not create _SUCCESS files. MapFileOutputFormat.getReaders calls // try to read the _SUCCESS as another MapFile dir. conf.set("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false"); // This job creates a MapFile of the titles indexed by the page id. // UnsplittableTextInputFormat is used to ensure that the same map task // gets all the lines in the titlesFile and it can count the line // numbers. The number of reduce tasks is set to 0. Job job = Job.getInstance(conf, "TitleIndex"); job.setJarByClass(InLinks.class); job.setInputFormatClass(UnsplittableTextInputFormat.class); job.setMapperClass(TitleIndexMapper.class); job.setOutputFormatClass(MapFileOutputFormat.class); job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(Text.class); FileInputFormat.addInputPath(job, titlesFile); FileOutputFormat.setOutputPath(job, outputDir); job.setNumReduceTasks(0); job.waitForCompletion(true); return 0; }
@Override protected void cleanup(Context context) throws IOException, InterruptedException { Configuration conf = context.getConfiguration(); Path titlesDir = new Path(conf.get("pagerank.titles_dir")); MapFile.Reader[] readers = MapFileOutputFormat.getReaders(titlesDir, conf); Partitioner<IntWritable, Text> partitioner = new HashPartitioner<IntWritable, Text>(); IntWritable page = new IntWritable(); Text title = new Text(); float[] pageRanks = new float[topN.size()]; String[] titles = new String[topN.size()]; // The order of the entries is reversed. The priority queue is in // non-decreasing order and we want the highest PageRank first. for (int i = pageRanks.length - 1; i >= 0; i--) { Map.Entry<Float, Integer> entry = topN.poll(); // Get the title of the page from the title index. page.set(entry.getValue()); MapFileOutputFormat.getEntry(readers, partitioner, page, title); pageRanks[i] = entry.getKey(); titles[i] = title.toString(); } for (MapFile.Reader reader : readers) { reader.close(); } for (int i = 0; i < pageRanks.length; i++) { context.write(new FloatWritable(pageRanks[i]), new Text(titles[i])); } }
private void pageRankIteration(int iter, Configuration conf, Path outputDir)
        throws Exception {

    // This job performs an iteration of the power iteration method to
    // compute PageRank. The map task processes each block M_{i,j}, loads
    // the corresponding stripe j of the vector v_{k-1} and produces the
    // partial result of the stripe i of the vector v_k. The reduce task
    // sums all the partial results of v_k and adds the teleportation factor
    // (the combiner only sums all the partial results). See Section 5.2
    // (and 5.2.3 in particular) of Mining of Massive Datasets
    // (http://infolab.stanford.edu/~ullman/mmds.html) for details. The
    // output is written in a "vk" subdir of the output dir, where k is the
    // iteration number. MapFileOutputFormat is used to keep an array of the
    // stripes of v.
    Job job = Job.getInstance(conf, "PageRank:Iteration");

    job.setJarByClass(PageRank.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(PageRankIterationMapper.class);
    job.setMapOutputKeyClass(ShortWritable.class);
    job.setMapOutputValueClass(FloatArrayWritable.class);
    job.setCombinerClass(PageRankIterationCombiner.class);
    job.setReducerClass(PageRankIterationReducer.class);
    job.setOutputFormatClass(MapFileOutputFormat.class);
    job.setOutputKeyClass(ShortWritable.class);
    job.setOutputValueClass(FloatArrayWritable.class);
    FileInputFormat.addInputPath(job, new Path(outputDir, "M"));
    FileOutputFormat.setOutputPath(job, new Path(outputDir, "v" + iter));
    job.waitForCompletion(true);
}
@Override protected void cleanup(Context context) throws IOException, InterruptedException { Configuration conf = context.getConfiguration(); Path titlesDir = new Path(conf.get("inlinks.titles_dir")); MapFile.Reader[] readers = MapFileOutputFormat.getReaders(titlesDir, conf); Partitioner<IntWritable, Text> partitioner = new HashPartitioner<IntWritable, Text>(); IntWritable page = new IntWritable(); Text title = new Text(); int[] inLinks = new int[topN.size()]; String[] titles = new String[topN.size()]; for (int i = inLinks.length - 1; i >= 0; i--) { Map.Entry<Integer, Integer> entry = topN.poll(); page.set(entry.getValue()); MapFileOutputFormat.getEntry(readers, partitioner, page, title); inLinks[i] = entry.getKey(); titles[i] = title.toString(); } for (MapFile.Reader reader : readers) { reader.close(); } for (int i = 0; i < inLinks.length; i++) { context.write(new IntWritable(inLinks[i]), new Text(titles[i])); } }
@Override public int run(String[] args) throws Exception { if (args.length != 2) { JobBuilder.printUsage(this, "<path> <key>"); return -1; } Path path = new Path(args[0]); IntWritable key = new IntWritable(Integer.parseInt(args[1])); Reader[] readers = MapFileOutputFormat.getReaders(path, getConf()); Partitioner<IntWritable, Text> partitioner = new HashPartitioner<IntWritable, Text>(); Text val = new Text(); Reader reader = readers[partitioner.getPartition(key, val, readers.length)]; Writable entry = reader.get(key, val); if (entry == null) { System.err.println("Key not found: " + key); return -1; } NcdcRecordParser parser = new NcdcRecordParser(); IntWritable nextKey = new IntWritable(); do { parser.parse(val.toString()); System.out.printf("%s\t%s\n", parser.getStationId(), parser.getYear()); } while (reader.next(nextKey, val) && key.equals(nextKey)); return 0; }
@Override
public void map(ShortArrayWritable inKey, MatrixBlockWritable inValue,
        Context context) throws IOException, InterruptedException {

    // This task gets each block M_{i,j}, loads the corresponding stripe j
    // of the vector v_{k-1} and produces the partial result of the stripe i
    // of the vector v_k.
    Configuration conf = context.getConfiguration();
    int iter = Integer.parseInt(conf.get("pagerank.iteration"));
    int numPages = Integer.parseInt(conf.get("pagerank.num_pages"));
    short blockSize = Short.parseShort(conf.get("pagerank.block_size"));

    Writable[] blockIndexes = inKey.get();
    short i = ((ShortWritable) blockIndexes[0]).get();
    short j = ((ShortWritable) blockIndexes[1]).get();

    int vjSize = (j > numPages / blockSize) ? (numPages % blockSize) : blockSize;
    FloatWritable[] vj = new FloatWritable[vjSize];

    if (iter == 1) {
        // Initial PageRank vector with 1/n for all pages.
        for (int k = 0; k < vj.length; k++) {
            vj[k] = new FloatWritable(1.0f / numPages);
        }
    } else {
        // Load the stripe j of the vector v_{k-1} from the MapFiles.
        Path outputDir = MapFileOutputFormat.getOutputPath(context).getParent();
        Path vjDir = new Path(outputDir, "v" + (iter - 1));
        MapFile.Reader[] readers = MapFileOutputFormat.getReaders(vjDir, conf);
        Partitioner<ShortWritable, FloatArrayWritable> partitioner =
                new HashPartitioner<ShortWritable, FloatArrayWritable>();
        ShortWritable key = new ShortWritable(j);
        FloatArrayWritable value = new FloatArrayWritable();
        MapFileOutputFormat.getEntry(readers, partitioner, key, value);
        Writable[] writables = value.get();
        for (int k = 0; k < vj.length; k++) {
            vj[k] = (FloatWritable) writables[k];
        }
        for (MapFile.Reader reader : readers) {
            reader.close();
        }
    }

    // Initialize the partial result i of the vector v_k.
    int viSize = (i > numPages / blockSize) ? (numPages % blockSize) : blockSize;
    FloatWritable[] vi = new FloatWritable[viSize];
    for (int k = 0; k < vi.length; k++) {
        vi[k] = new FloatWritable(0);
    }

    // Multiply M_{i,j} by the stripe j of the vector v_{k-1} to obtain the
    // partial result i of the vector v_k.
    Writable[][] blockColumns = inValue.get();
    for (int k = 0; k < blockColumns.length; k++) {
        Writable[] blockColumn = blockColumns[k];
        if (blockColumn.length > 0) {
            int vDegree = ((ShortWritable) blockColumn[0]).get();
            for (int columnIndex = 1; columnIndex < blockColumn.length; columnIndex++) {
                int l = ((ShortWritable) blockColumn[columnIndex]).get();
                vi[l].set(vi[l].get() + (1.0f / vDegree) * vj[k].get());
            }
        }
    }
    context.write(new ShortWritable(i), new FloatArrayWritable(vi));
}
/**
 * Save a {@code JavaRDD<List<Writable>>} to a Hadoop {@link org.apache.hadoop.io.MapFile}. Each record is
 * given a <i>unique and contiguous</i> {@link LongWritable} key, and values are stored as
 * {@link RecordWritable} instances.<br>
 * <b>Note</b>: If contiguous keys are not required, using a sequence file instead is preferable from a performance
 * point of view. Contiguous keys are often only required for non-Spark use cases, such as with
 * {@link org.datavec.hadoop.records.reader.mapfile.MapFileRecordReader}
 * <p>
 * Use {@link #restoreMapFile(String, JavaSparkContext)} to restore values saved with this method.
 *
 * @param path           Path to save the MapFile
 * @param rdd            RDD to save
 * @param c              Configuration object, used to customise options for the map file
 * @param maxOutputFiles Nullable. If non-null: first coalesce the RDD to the specified size (number of partitions)
 *                       to limit the maximum number of output map files
 * @see #saveMapFileSequences(String, JavaRDD)
 * @see #saveSequenceFile(String, JavaRDD)
 */
public static void saveMapFile(String path, JavaRDD<List<Writable>> rdd, Configuration c,
                @Nullable Integer maxOutputFiles) {
    path = FilenameUtils.normalize(path, true);
    if (maxOutputFiles != null) {
        rdd = rdd.coalesce(maxOutputFiles);
    }
    // Note: Long values from zipWithIndex are unique + contiguous, but this requires a count
    JavaPairRDD<List<Writable>, Long> dataIndexPairs = rdd.zipWithIndex();
    JavaPairRDD<LongWritable, RecordWritable> keyedByIndex =
                    dataIndexPairs.mapToPair(new RecordSavePrepPairFunction());

    keyedByIndex.saveAsNewAPIHadoopFile(path, LongWritable.class, RecordWritable.class,
                    MapFileOutputFormat.class, c);
}
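A minimal usage sketch for the method above. It assumes the method lives in DataVec's SparkStorageUtils class (the enclosing class is not shown here), that DataVec's org.datavec.api.writable types are on the classpath, and that a local Spark master, the toy records, and the /tmp output path are acceptable for experimentation.

// Hypothetical driver for illustration only; the class name SparkStorageUtils,
// the output path, and the toy data are assumptions, not part of the snippet above.
import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.datavec.api.writable.IntWritable;
import org.datavec.api.writable.Text;
import org.datavec.api.writable.Writable;

public class SaveMapFileSketch {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext(
                new SparkConf().setAppName("saveMapFile sketch").setMaster("local[*]"));

        // Each record is a List<Writable>; two toy records here.
        List<List<Writable>> records = Arrays.asList(
                Arrays.<Writable>asList(new Text("page-a"), new IntWritable(3)),
                Arrays.<Writable>asList(new Text("page-b"), new IntWritable(7)));
        JavaRDD<List<Writable>> rdd = sc.parallelize(records);

        // Write the records as a MapFile; maxOutputFiles = 1 coalesces the RDD
        // so that a single map file directory is produced.
        SparkStorageUtils.saveMapFile("/tmp/example-mapfile", rdd, new Configuration(), 1);

        sc.stop();
    }
}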
/**
 * Save a {@code JavaRDD<List<List<Writable>>>} to a Hadoop {@link org.apache.hadoop.io.MapFile}. Each record is
 * given a <i>unique and contiguous</i> {@link LongWritable} key, and values are stored as
 * {@link SequenceRecordWritable} instances.<br>
 * <b>Note</b>: If contiguous keys are not required, using a sequence file instead is preferable from a performance
 * point of view. Contiguous keys are often only required for non-Spark use cases, such as with
 * {@link org.datavec.hadoop.records.reader.mapfile.MapFileSequenceRecordReader}<br>
 * <p>
 * Use {@link #restoreMapFileSequences(String, JavaSparkContext)} to restore values saved with this method.
 *
 * @param path           Path to save the MapFile
 * @param rdd            RDD to save
 * @param c              Configuration object, used to customise options for the map file
 * @param maxOutputFiles Nullable. If non-null: first coalesce the RDD to the specified size (number of partitions)
 *                       to limit the maximum number of output map files
 * @see #saveMapFileSequences(String, JavaRDD)
 * @see #saveSequenceFile(String, JavaRDD)
 */
public static void saveMapFileSequences(String path, JavaRDD<List<List<Writable>>> rdd, Configuration c,
                @Nullable Integer maxOutputFiles) {
    path = FilenameUtils.normalize(path, true);
    if (maxOutputFiles != null) {
        rdd = rdd.coalesce(maxOutputFiles);
    }
    JavaPairRDD<List<List<Writable>>, Long> dataIndexPairs = rdd.zipWithIndex();
    JavaPairRDD<LongWritable, SequenceRecordWritable> keyedByIndex =
                    dataIndexPairs.mapToPair(new SequenceRecordSavePrepPairFunction());

    keyedByIndex.saveAsNewAPIHadoopFile(path, LongWritable.class, SequenceRecordWritable.class,
                    MapFileOutputFormat.class, c);
}
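A corresponding sketch for the sequence variant, with the same caveats (the SparkStorageUtils class name, local master, and output path are assumptions). Here each RDD element is a sequence, i.e. a list of time steps, each of which is itself a List<Writable>.

// Hypothetical driver for illustration only; names and data are assumptions.
import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.datavec.api.writable.DoubleWritable;
import org.datavec.api.writable.Writable;

public class SaveMapFileSequencesSketch {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext(
                new SparkConf().setAppName("saveMapFileSequences sketch").setMaster("local[*]"));

        // One toy sequence with two time steps; each time step is a List<Writable>.
        List<List<Writable>> sequence = Arrays.asList(
                Arrays.<Writable>asList(new DoubleWritable(0.1), new DoubleWritable(0.2)),
                Arrays.<Writable>asList(new DoubleWritable(0.3), new DoubleWritable(0.4)));
        JavaRDD<List<List<Writable>>> rdd = sc.parallelize(Arrays.asList(sequence));

        // Write the sequences as a MapFile keyed by a contiguous LongWritable index;
        // per the Javadoc above, restoreMapFileSequences(path, sc) reads them back.
        // Passing null for maxOutputFiles skips the coalesce step.
        SparkStorageUtils.saveMapFileSequences("/tmp/example-mapfile-seq", rdd, new Configuration(), null);

        sc.stop();
    }
}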