Example source code for the Java class org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat
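
The snippets below are collected from open-source projects. For orientation first, a minimal self-contained sketch (written for this page rather than taken from any project below; the paths and codec choice are illustrative) of a map-only job that copies its text input into a block-compressed SequenceFile:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.DefaultCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

public class TextToSequenceFile {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "text-to-sequencefile");
        job.setJarByClass(TextToSequenceFile.class);
        job.setMapperClass(Mapper.class); // identity mapper
        job.setNumReduceTasks(0);         // map-only: mapper output goes straight to the output format
        // With the default TextInputFormat, the identity mapper emits <LongWritable, Text>;
        // these classes are recorded in the SequenceFile header.
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        SequenceFileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Optional compression, configured the same way several snippets below do it:
        SequenceFileOutputFormat.setCompressOutput(job, true);
        SequenceFileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
        SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}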

Project: LDA    File: CalParamDriver.java
public static void run(Configuration conf, Path inputPath, Path output, double params) throws IOException, ClassNotFoundException, InterruptedException {
    String jobName = "calculating parameter";
    conf.set("params",String.valueOf(params));

    Job job = new Job(conf, jobName);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(indexToCountWritable.class);
    job.setOutputKeyClass(twoDimensionIndexWritable.class);
    job.setOutputValueClass(Text.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapperClass(CalParamsMapper.class);
    job.setReducerClass(CalParamsReducer.class);

    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, output);

    job.setJarByClass(LDADriver.class);
    if (!job.waitForCompletion(true)) {
        throw new InterruptedException("calculating parameter failed");
    }
}
Project: HBase-High-Performance-Cookbook    File: InputDriver.java
public static void runJob(Path input, Path output, String vectorClassName,Configuration config)
  throws IOException, InterruptedException, ClassNotFoundException {
  Configuration conf = config;
  conf.set("vector.implementation.class.name", vectorClassName);
  Job job = new Job(conf, "Input Driver running over input: " + input);

  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(VectorWritable.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setMapperClass(InputMapper.class);   
  job.setNumReduceTasks(0);
  job.setJarByClass(InputDriver.class);

  FileInputFormat.addInputPath(job, input);
  FileOutputFormat.setOutputPath(job, output);

  job.waitForCompletion(true);
}
Project: aliyun-maxcompute-data-collectors    File: DataDrivenImportJob.java
@Override
protected Class<? extends OutputFormat> getOutputFormatClass()
    throws ClassNotFoundException {
  if (isHCatJob) {
    LOG.debug("Returning HCatOutputFormat for output format");
    return SqoopHCatUtilities.getOutputFormatClass();
  }
  if (options.getFileLayout() == SqoopOptions.FileLayout.TextFile) {
    return RawKeyTextOutputFormat.class;
  } else if (options.getFileLayout()
      == SqoopOptions.FileLayout.SequenceFile) {
    return SequenceFileOutputFormat.class;
  } else if (options.getFileLayout()
      == SqoopOptions.FileLayout.AvroDataFile) {
    return AvroOutputFormat.class;
  } else if (options.getFileLayout()
      == SqoopOptions.FileLayout.ParquetFile) {
    return DatasetKeyOutputFormat.class;
  }

  return null;
}
Project: hadoop    File: TestJoinDatamerge.java
private static void joinAs(String jointype, 
    Class<? extends SimpleCheckerMapBase<?>> map, 
    Class<? extends SimpleCheckerReduceBase> reduce) throws Exception {
  final int srcs = 4;
  Configuration conf = new Configuration();
  Path base = cluster.getFileSystem().makeQualified(new Path("/"+jointype));
  Path[] src = writeSimpleSrc(base, conf, srcs);
  conf.set(CompositeInputFormat.JOIN_EXPR, CompositeInputFormat.compose(jointype,
      SequenceFileInputFormat.class, src));
  conf.setInt("testdatamerge.sources", srcs);
  Job job = Job.getInstance(conf);
  job.setInputFormatClass(CompositeInputFormat.class);
  FileOutputFormat.setOutputPath(job, new Path(base, "out"));

  job.setMapperClass(map);
  job.setReducerClass(reduce);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(IntWritable.class);
  job.waitForCompletion(true);
  assertTrue("Job failed", job.isSuccessful());
  if ("outer".equals(jointype)) {
    checkOuterConsistency(job, src);
  }
  base.getFileSystem(conf).delete(base, true);
}
Project: hadoop    File: RandomTextWriterJob.java
public Job createJob(Configuration conf) throws IOException {
  long numBytesToWritePerMap = conf.getLong(BYTES_PER_MAP, 10 * 1024);
  long totalBytesToWrite = conf.getLong(TOTAL_BYTES, numBytesToWritePerMap);
  int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap);
  if (numMaps == 0 && totalBytesToWrite > 0) {
    numMaps = 1;
    conf.setLong(BYTES_PER_MAP, totalBytesToWrite);
  }
  conf.setInt(MRJobConfig.NUM_MAPS, numMaps);

  Job job = Job.getInstance(conf);

  job.setJarByClass(RandomTextWriterJob.class);
  job.setJobName("random-text-writer");

  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);

  job.setInputFormatClass(RandomInputFormat.class);
  job.setMapperClass(RandomTextMapper.class);

  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  //FileOutputFormat.setOutputPath(job, new Path("random-output"));
  job.setNumReduceTasks(0);
  return job;
}
Project: LDA    File: InputDriver.java
public static void runJob(Configuration conf, Path inputPath, Path output) throws IOException, ClassNotFoundException, InterruptedException {

        Job job = new Job(conf, "Input Drive running input:"+inputPath);
        log.info("start running InputDriver");
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(indexToWordWritable.class);
        job.setOutputKeyClass(twoDimensionIndexWritable.class);
        job.setOutputValueClass(Text.class);

        job.setMapperClass(InputMapper.class);
        job.setReducerClass(InputReducer.class);
        job.setNumReduceTasks(1);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        job.setJarByClass(InputDriver.class);

        FileInputFormat.addInputPath(job, inputPath);
        FileOutputFormat.setOutputPath(job, output);

        boolean succeeded = job.waitForCompletion(true);
        if (!succeeded) {
            throw new IllegalStateException("Job failed!");
        }

    }
Project: oryx2    File: SaveToHDFSFunction.java
@Override
public void call(JavaPairRDD<K,M> rdd, Time time) throws IOException {
  if (rdd.isEmpty()) {
    log.info("RDD was empty, not saving to HDFS");
  } else {
    String file = prefix + "-" + time.milliseconds() + "." + suffix;
    Path path = new Path(file);
    FileSystem fs = FileSystem.get(path.toUri(), hadoopConf);
    if (fs.exists(path)) {
      log.warn("Saved data already existed, possibly from a failed job. Deleting {}", path);
      fs.delete(path, true);
    }
    log.info("Saving RDD to HDFS at {}", file);
    rdd.mapToPair(
        new ValueToWritableFunction<>(keyClass, messageClass, keyWritableClass, messageWritableClass)
    ).saveAsNewAPIHadoopFile(
        file,
        keyWritableClass,
        messageWritableClass,
        SequenceFileOutputFormat.class,
        hadoopConf);
  }
}
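
A hedged companion sketch for the snippet above: pairs saved with saveAsNewAPIHadoopFile can be read back into Spark through the matching new-API input format. The Text key/value classes and the path argument are illustrative stand-ins for keyWritableClass/messageWritableClass and the generated file name:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class ReadSavedPairs {
    public static void main(String[] args) {
        JavaSparkContext jsc = new JavaSparkContext(new SparkConf().setAppName("read-saved-pairs"));
        // Reads every SequenceFile under args[0] into <Text, Text> pairs.
        JavaPairRDD<Text, Text> pairs = jsc.newAPIHadoopFile(
            args[0], SequenceFileInputFormat.class, Text.class, Text.class, new Configuration());
        System.out.println("records read back: " + pairs.count());
        jsc.stop();
    }
}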
Project: ditb    File: Export.java
/**
 * Sets up the actual job.
 *
 * @param conf  The current configuration.
 * @param args  The command line parameters.
 * @return The newly created job.
 * @throws IOException When setting up the job fails.
 */
public static Job createSubmittableJob(Configuration conf, String[] args)
throws IOException {
  String tableName = args[0];
  Path outputDir = new Path(args[1]);
  Job job = new Job(conf, NAME + "_" + tableName);
  job.setJobName(NAME + "_" + tableName);
  job.setJarByClass(Export.class);
  // Set optional scan parameters
  Scan s = getConfiguredScanForJob(conf, args);
  IdentityTableMapper.initJob(tableName, s, IdentityTableMapper.class, job);
  // No reducers.  Just write straight to output files.
  job.setNumReduceTasks(0);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setOutputKeyClass(ImmutableBytesWritable.class);
  job.setOutputValueClass(Result.class);
  FileOutputFormat.setOutputPath(job, outputDir); // job conf doesn't contain the conf so doesn't have a default fs.
  return job;
}
Project: ditb    File: IntegrationTestBigLinkedList.java
public int runRandomInputGenerator(int numMappers, long numNodes, Path tmpOutput,
    Integer width, Integer wrapMuplitplier) throws Exception {
  LOG.info("Running RandomInputGenerator with numMappers=" + numMappers
      + ", numNodes=" + numNodes);
  Job job = Job.getInstance(getConf());

  job.setJobName("Random Input Generator");
  job.setNumReduceTasks(0);
  job.setJarByClass(getClass());

  job.setInputFormatClass(GeneratorInputFormat.class);
  job.setOutputKeyClass(BytesWritable.class);
  job.setOutputValueClass(NullWritable.class);

  setJobConf(job, numMappers, numNodes, width, wrapMuplitplier);

  job.setMapperClass(Mapper.class); //identity mapper

  FileOutputFormat.setOutputPath(job, tmpOutput);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);

  boolean success = jobCompletion(job);

  return success ? 0 : 1;
}
Project: elasticsearch-mapreduce    File: Es2Json.java
public static void Run(String output, String outputFormat, int reducerNum, Configuration conf) 
            throws IOException, ClassNotFoundException, InterruptedException {
        Job job = Job.getInstance(conf);
//        job.setJobName(Es2Json.class.getName());
        job.setJarByClass(Es2Json.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setMapperClass(MapTask.class);
        job.setReducerClass(ReduceTask.class);
        job.setInputFormatClass(EsInputFormat.class);

        if (outputFormat.equals("sequencefile")) {
            job.setOutputFormatClass(SequenceFileOutputFormat.class);
        }

        job.setNumReduceTasks(reducerNum);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileOutputFormat.setOutputPath(job, new Path(output));

        job.setSpeculativeExecution(false);
        job.waitForCompletion(true);
    }
Project: LiteGraph    File: OutputFormatRDD.java
@Override
public <K, V> Iterator<KeyValue<K, V>> writeMemoryRDD(final Configuration configuration, final String memoryKey, JavaPairRDD<K, V> memoryRDD) {
    final org.apache.hadoop.conf.Configuration hadoopConfiguration = ConfUtil.makeHadoopConfiguration(configuration);
    final String outputLocation = hadoopConfiguration.get(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION);
    if (null != outputLocation) {
        // map back to a Hadoop stream for output
        memoryRDD.mapToPair(keyValue -> new Tuple2<>(new ObjectWritable<>(keyValue._1()), new ObjectWritable<>(keyValue._2())))
                .saveAsNewAPIHadoopFile(Constants.getMemoryLocation(outputLocation, memoryKey),
                        ObjectWritable.class,
                        ObjectWritable.class,
                        SequenceFileOutputFormat.class, hadoopConfiguration);
        try {
            return (Iterator) new ObjectWritableIterator(hadoopConfiguration, new Path(Constants.getMemoryLocation(outputLocation, memoryKey)));
        } catch (final IOException e) {
            throw new IllegalStateException(e.getMessage(), e);
        }
    }
    return Collections.emptyIterator();
}
Project: TopPI    File: TopPIoverHadoop.java
private boolean bigItemCount(String output) throws IOException, ClassNotFoundException, InterruptedException {
    Job job = Job.getInstance(this.getConf(), "Counting items from " + this.input);
    job.setJarByClass(TopPIoverHadoop.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(IntWritable.class);

    FileInputFormat.addInputPath(job, new Path(this.input));
    FileOutputFormat.setOutputPath(job, new Path(output));

    job.setMapperClass(ItemBigCountingMapper.class);
    job.setReducerClass(ItemBigCountingReducer.class);

    boolean success = job.waitForCompletion(true);

    if (success) {
        Counter rebasingMaxID = job.getCounters().findCounter(TaskCounter.REDUCE_OUTPUT_RECORDS);
        this.getConf().setInt(KEY_REBASING_MAX_ID, (int) rebasingMaxID.getValue());
    }

    return success;
}
Project: TopPI    File: TopPIoverHadoop.java
private boolean genBigItemMap(String input, String output) throws IOException, ClassNotFoundException,
        InterruptedException {
    Job job = Job.getInstance(this.getConf(), "Computing items remapping for " + this.input);
    job.setJarByClass(TopPIoverHadoop.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(IntWritable.class);

    FileInputFormat.addInputPath(job, new Path(input));
    FileOutputFormat.setOutputPath(job, new Path(output));

    job.setMapperClass(InverseMapper.class);
    job.setReducerClass(ItemBigRebasingReducer.class);
    job.setNumReduceTasks(1);

    return job.waitForCompletion(true);
}
Project: TopPI    File: TopPIoverHadoop.java
private boolean filterInput(String output, String rebasingMapPath) throws IOException, ClassNotFoundException,
        InterruptedException {
    Job job = Job.getInstance(this.getConf(), "Computing items remapping for " + this.input);
    job.setJarByClass(TopPIoverHadoop.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(ConcatenatedTransactionsWritable.class);
    DistCache.copyToCache(job, rebasingMapPath);
    FileInputFormat.addInputPath(job, new Path(this.input));
    FileOutputFormat.setOutputPath(job, new Path(output));

    job.setMapperClass(FilteringMapper.class);
    job.setNumReduceTasks(0);

    return job.waitForCompletion(true);
}
Project: iis    File: SoftwareExporterJob.java
private static JavaRDD<Tuple3<String, String, Float>> handleRelations(JavaRDD<DocumentToSoftwareUrlWithMeta> documentToSoftwareUrl, String actionSetId, 
        Configuration jobConfig, String outputAvroPath) {
    JavaRDD<Tuple3<String, String, Float>> distinctRelationTriples = documentToSoftwareUrl
            .map(e -> new Tuple3<>(e.getDocumentId().toString(), generateSoftwareEntityId(pickUrl(e)), e.getConfidenceLevel()))
            .distinct();

    JavaPairRDD<String, Tuple3<String, String, Float>> relationTriplesByIdPair = distinctRelationTriples
            .mapToPair(e -> new Tuple2<String, Tuple3<String, String, Float>>(
                    joinDocumentAndSoftwareIds(e._1(), e._2()), e));

    JavaRDD<Tuple3<String, String, Float>> dedupedRelationTriples = relationTriplesByIdPair
            .reduceByKey((x, y) -> pickBestConfidence(x, y)).values();
    // to be used by both entity exporter and reporter consumers
    dedupedRelationTriples.cache();

    JavaPairRDD<Text, Text> relationResult = dedupedRelationTriples.flatMapToPair(x -> (Iterable<Tuple2<Text, Text>>) 
            buildRelationActions(x._1(), x._2(), x._3(), actionSetId).stream()
            .map(action -> new Tuple2<Text, Text>(new Text(action.getRowKey()),
                    new Text(action.toString())))::iterator);
    relationResult.coalesce(numberOfOutputFiles).saveAsNewAPIHadoopFile(outputAvroPath, Text.class, Text.class, SequenceFileOutputFormat.class, jobConfig);

    return dedupedRelationTriples;
}
Project: LCIndex-HBase-0.94.16    File: IntegrationTestBigLinkedList.java
public int runRandomInputGenerator(int numMappers, long numNodes, Path tmpOutput,
    Integer width, Integer wrapMuplitplier) throws Exception {
  LOG.info("Running RandomInputGenerator with numMappers=" + numMappers
      + ", numNodes=" + numNodes);
  Job job = new Job(getConf());

  job.setJobName("Random Input Generator");
  job.setNumReduceTasks(0);
  job.setJarByClass(getClass());

  job.setInputFormatClass(GeneratorInputFormat.class);
  job.setOutputKeyClass(BytesWritable.class);
  job.setOutputValueClass(NullWritable.class);

  setJobConf(job, numMappers, numNodes, width, wrapMuplitplier);

  job.setMapperClass(Mapper.class); //identity mapper

  FileOutputFormat.setOutputPath(job, tmpOutput);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);

  boolean success = job.waitForCompletion(true);

  return success ? 0 : 1;
}
Project: sPCA    File: NormalizeJob.java
public void run(Configuration conf, Path matrixInputPath,
    String meanSpanFileName, Path matrixOutputPath, double sampleRate)
    throws IOException, InterruptedException, ClassNotFoundException {
  conf.set(MEANSPANOPTION, meanSpanFileName);
  conf.setFloat(SAMPLERATE, (float) sampleRate);
  Job job = new Job(conf);
  job.setJobName("Normalize");
  job.setJarByClass(NormalizeJob.class);
  FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf);
  matrixInputPath = fs.makeQualified(matrixInputPath);
  matrixOutputPath = fs.makeQualified(matrixOutputPath);
  FileInputFormat.addInputPath(job, matrixInputPath);
  job.setInputFormatClass(SequenceFileInputFormat.class);
  FileOutputFormat.setOutputPath(job, matrixOutputPath);
  job.setMapperClass(NormalizeMapper.class);
  job.setNumReduceTasks(0);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(VectorWritable.class);
  job.submit();
  job.waitForCompletion(true);
}
Project: hadoop-2.6.0-cdh5.4.3    File: RandomTextWriterJob.java
public Job createJob(Configuration conf) throws IOException {
  long numBytesToWritePerMap = conf.getLong(BYTES_PER_MAP, 10 * 1024);
  long totalBytesToWrite = conf.getLong(TOTAL_BYTES, numBytesToWritePerMap);
  int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap);
  if (numMaps == 0 && totalBytesToWrite > 0) {
    numMaps = 1;
    conf.setLong(BYTES_PER_MAP, totalBytesToWrite);
  }
  conf.setInt(MRJobConfig.NUM_MAPS, numMaps);

  Job job = new Job(conf);

  job.setJarByClass(RandomTextWriterJob.class);
  job.setJobName("random-text-writer");

  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);

  job.setInputFormatClass(RandomInputFormat.class);
  job.setMapperClass(RandomTextMapper.class);

  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  //FileOutputFormat.setOutputPath(job, new Path("random-output"));
  job.setNumReduceTasks(0);
  return job;
}
Project: kylin    File: KafkaFlatTableJob.java
private void setupMapper(CubeSegment cubeSeg) throws IOException {
    // set the segment's offset info to job conf
    Map<Integer, Long> offsetStart = cubeSeg.getSourcePartitionOffsetStart();
    Map<Integer, Long> offsetEnd = cubeSeg.getSourcePartitionOffsetEnd();

    Integer minPartition = Collections.min(offsetStart.keySet());
    Integer maxPartition = Collections.max(offsetStart.keySet());
    job.getConfiguration().set(CONFIG_KAFKA_PARITION_MIN, minPartition.toString());
    job.getConfiguration().set(CONFIG_KAFKA_PARITION_MAX, maxPartition.toString());

    for(Integer partition: offsetStart.keySet()) {
        job.getConfiguration().set(CONFIG_KAFKA_PARITION_START + partition, offsetStart.get(partition).toString());
        job.getConfiguration().set(CONFIG_KAFKA_PARITION_END + partition, offsetEnd.get(partition).toString());
    }

    job.setMapperClass(KafkaFlatTableMapper.class);
    job.setInputFormatClass(KafkaInputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setNumReduceTasks(0);
}
Project: Chi-FRBCS-BigData-Ave    File: PartialBuilder.java
@Override
protected void configureJob(Job job) throws IOException {
  Configuration conf = job.getConfiguration();

  job.setJarByClass(PartialBuilder.class);

  FileInputFormat.setInputPaths(job, getDataPath());
  FileOutputFormat.setOutputPath(job, getOutputPath(conf));

  job.setMapOutputKeyClass(LongWritable.class);
  job.setMapOutputValueClass(MapredOutput.class);

  job.setOutputKeyClass(LongWritable.class);
  job.setOutputValueClass(RuleBase.class);

  job.setMapperClass(ChiMapper.class);
  job.setReducerClass(ChiReducer.class);

  job.setNumReduceTasks(1);

  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
}
Project: Chi-FRBCS-BigData-Max    File: MeanShiftCanopyDriver.java
/**
 * Cluster the data using Hadoop
 */
private static void clusterDataMR(Path input, Path clustersIn, Path output)
    throws IOException, InterruptedException, ClassNotFoundException {
  Configuration conf = new Configuration();
  conf.set(STATE_IN_KEY, clustersIn.toString());
  Job job = new Job(conf,
      "Mean Shift Driver running clusterData over input: " + input);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(WeightedVectorWritable.class);
  job.setMapperClass(MeanShiftCanopyClusterMapper.class);

  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setNumReduceTasks(0);
  job.setJarByClass(MeanShiftCanopyDriver.class);

  FileInputFormat.setInputPaths(job, input);
  FileOutputFormat.setOutputPath(job, output);

  if (!job.waitForCompletion(true)) {
    throw new InterruptedException(
        "Mean Shift Clustering failed on clustersIn " + clustersIn);
  }
}
Project: Chi-FRBCS-BigData-Ave    File: UnitVectorizerJob.java
public static void runJob(Path input, Path output)
  throws IOException, InterruptedException, ClassNotFoundException {

  Configuration conf = new Configuration();
  Job job = new Job(conf, "UnitVectorizerJob");

  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(VectorWritable.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setMapperClass(UnitVectorizerMapper.class);
  job.setNumReduceTasks(0);

  FileInputFormat.addInputPath(job, input);
  FileOutputFormat.setOutputPath(job, output);

  job.setJarByClass(UnitVectorizerJob.class);

  boolean succeeded = job.waitForCompletion(true);
  if (!succeeded) {
    throw new IllegalStateException("Job failed!");
  }
}
Project: Chi-FRBCS-BigDataCS    File: MeanShiftCanopyDriver.java
/**
 * Convert vectors to MeanShiftCanopies using Hadoop
 */
private static void createCanopyFromVectorsMR(Configuration conf, Path input,
    Path output, DistanceMeasure measure) throws IOException,
    InterruptedException, ClassNotFoundException {
  conf.set(KMeansConfigKeys.DISTANCE_MEASURE_KEY, measure.getClass()
      .getName());
  Job job = new Job(conf);
  job.setJarByClass(MeanShiftCanopyDriver.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(ClusterWritable.class);
  job.setMapperClass(MeanShiftCanopyCreatorMapper.class);
  job.setNumReduceTasks(0);
  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);

  FileInputFormat.setInputPaths(job, input);
  FileOutputFormat.setOutputPath(job, output);

  if (!job.waitForCompletion(true)) {
    throw new InterruptedException(
        "Mean Shift createCanopyFromVectorsMR failed on input " + input);
  }
}
Project: Chi-FRBCS-BigDataCS    File: CVB0Driver.java
private static double calculatePerplexity(Configuration conf, Path corpusPath, Path modelPath, int iteration)
  throws IOException, ClassNotFoundException, InterruptedException {
  String jobName = "Calculating perplexity for " + modelPath;
  log.info("About to run: " + jobName);
  Job job = new Job(conf, jobName);
  job.setJarByClass(CachingCVB0PerplexityMapper.class);
  job.setMapperClass(CachingCVB0PerplexityMapper.class);
  job.setCombinerClass(DualDoubleSumReducer.class);
  job.setReducerClass(DualDoubleSumReducer.class);
  job.setNumReduceTasks(1);
  job.setOutputKeyClass(DoubleWritable.class);
  job.setOutputValueClass(DoubleWritable.class);
  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  FileInputFormat.addInputPath(job, corpusPath);
  Path outputPath = perplexityPath(modelPath.getParent(), iteration);
  FileOutputFormat.setOutputPath(job, outputPath);
  setModelPaths(job, modelPath);
  HadoopUtil.delete(conf, outputPath);
  if (!job.waitForCompletion(true)) {
    throw new InterruptedException("Failed to calculate perplexity for: " + modelPath);
  }
  return readPerplexity(conf, modelPath.getParent(), iteration);
}
Project: Chi-FRBCS-BigDataCS    File: CVB0Driver.java
private static Job writeTopicModel(Configuration conf, Path modelInput, Path output)
  throws IOException, InterruptedException, ClassNotFoundException {
  String jobName = String.format("Writing final topic/term distributions from %s to %s", modelInput, output);
  log.info("About to run: " + jobName);
  Job job = new Job(conf, jobName);
  job.setJarByClass(CVB0Driver.class);
  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setMapperClass(CVB0TopicTermVectorNormalizerMapper.class);
  job.setNumReduceTasks(0);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(VectorWritable.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  FileInputFormat.addInputPath(job, modelInput);
  FileOutputFormat.setOutputPath(job, output);
  job.submit();
  return job;
}
Project: Chi-FRBCS-BigDataCS    File: CVB0Driver.java
private static Job writeDocTopicInference(Configuration conf, Path corpus, Path modelInput, Path output)
  throws IOException, ClassNotFoundException, InterruptedException {
  String jobName = String.format("Writing final document/topic inference from %s to %s", corpus, output);
  log.info("About to run: " + jobName);
  Job job = new Job(conf, jobName);
  job.setMapperClass(CVB0DocInferenceMapper.class);
  job.setNumReduceTasks(0);
  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(VectorWritable.class);
  FileSystem fs = FileSystem.get(corpus.toUri(), conf);
  if (modelInput != null && fs.exists(modelInput)) {
    FileStatus[] statuses = fs.listStatus(modelInput, PathFilters.partFilter());
    URI[] modelUris = new URI[statuses.length];
    for (int i = 0; i < statuses.length; i++) {
      modelUris[i] = statuses[i].getPath().toUri();
    }
    DistributedCache.setCacheFiles(modelUris, conf);
  }
  FileInputFormat.addInputPath(job, corpus);
  FileOutputFormat.setOutputPath(job, output);
  job.setJarByClass(CVB0Driver.class);
  job.submit();
  return job;
}
Project: Chi-FRBCS-BigData-Ave    File: SimpleTextEncodingVectorizer.java
@Override
public void createVectors(Path input, Path output, VectorizerConfig config)
  throws IOException, ClassNotFoundException, InterruptedException {
  //do this for convenience of using prepareJob
  Job job = HadoopUtil.prepareJob(input, output,
                                  SequenceFileInputFormat.class,
                                  EncodingMapper.class,
                                  Text.class,
                                  VectorWritable.class,
                                  SequenceFileOutputFormat.class,
                                  config.getConf());
  Configuration conf = job.getConfiguration();
  conf.set(EncodingMapper.USE_SEQUENTIAL, String.valueOf(config.isSequentialAccess()));
  conf.set(EncodingMapper.USE_NAMED_VECTORS, String.valueOf(config.isNamedVectors()));
  conf.set(EncodingMapper.ANALYZER_NAME, config.getAnalyzerClassName());
  conf.set(EncodingMapper.ENCODER_FIELD_NAME, config.getEncoderName());
  conf.set(EncodingMapper.ENCODER_CLASS, config.getEncoderClass());
  conf.set(EncodingMapper.CARDINALITY, String.valueOf(config.getCardinality()));
  job.setNumReduceTasks(0);
  boolean finished = job.waitForCompletion(true);

  log.info("result of run: {}", finished);
  if (!finished) {
    throw new IllegalStateException("Job failed!");
  }
}
Project: Chi-FRBCS-BigData-Max    File: ClusterOutputPostProcessorDriver.java
/**
 * Process as a map-reduce job. The number of reduce tasks is set to the number of clusters
 * present in the output, so that each cluster's vectors are written to their own part file.
 * 
 * @param conf
 *          The hadoop configuration.
 * @param input
 *          The output path provided to the clustering algorithm, which is to be post-processed. Hint: the
 *          path of the directory containing clusters-*-final and clusteredPoints.
 * @param output
 *          The path under which the post-processed data is stored.
 */
private static void postProcessMR(Configuration conf, Path input, Path output) throws IOException,
                                                                              InterruptedException,
                                                                              ClassNotFoundException {
  Job job = new Job(conf, "ClusterOutputPostProcessor Driver running over input: " + input);
  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setMapperClass(ClusterOutputPostProcessorMapper.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(VectorWritable.class);
  job.setReducerClass(ClusterOutputPostProcessorReducer.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(VectorWritable.class);
  int numberOfClusters = ClusterCountReader.getNumberOfClusters(input, conf);
  job.setNumReduceTasks(numberOfClusters);
  job.setJarByClass(ClusterOutputPostProcessorDriver.class);

  FileInputFormat.addInputPath(job, new Path(input, new Path("clusteredPoints")));
  FileOutputFormat.setOutputPath(job, output);
  if (!job.waitForCompletion(true)) {
    throw new InterruptedException("ClusterOutputPostProcessor Job failed processing " + input);
  }
}
Project: ignite    File: HadoopWordCount2.java
/**
 * Sets task classes, with related settings if needed, on the job.
 *
 * @param job Job to configure.
 * @param setMapper Option to set the mapper and input format classes.
 * @param setCombiner Option to set the combiner class.
 * @param setReducer Option to set the reducer and output format classes.
 * @param outputCompression Option to enable block-compressed SequenceFile output.
 */
public static void setTasksClasses(Job job, boolean setMapper, boolean setCombiner, boolean setReducer,
        boolean outputCompression) {
    if (setMapper) {
        job.setMapperClass(HadoopWordCount2Mapper.class);
        job.setInputFormatClass(TextInputFormat.class);
    }

    if (setCombiner)
        job.setCombinerClass(HadoopWordCount2Combiner.class);

    if (setReducer) {
        job.setReducerClass(HadoopWordCount2Reducer.class);
        job.setOutputFormatClass(TextOutputFormat.class);
    }

    if (outputCompression) {
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

        SequenceFileOutputFormat.setCompressOutput(job, true);

        job.getConfiguration().set(FileOutputFormat.COMPRESS_CODEC, SnappyCodec.class.getName());
    }
}
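
A hedged usage sketch for the method above, as it might appear in a driver inside the same class (the paths are illustrative, and the Text/IntWritable word-count output types are an assumption, not taken from the ignite source):

// Assumed to run inside HadoopWordCount2, so setTasksClasses is in scope.
Job job = Job.getInstance(new Configuration(), "word-count");
job.setJarByClass(HadoopWordCount2.class);
job.setOutputKeyClass(Text.class);          // assumption: word count emits <Text, IntWritable>
job.setOutputValueClass(IntWritable.class);
setTasksClasses(job, true, true, true, /* outputCompression */ true);
FileInputFormat.addInputPath(job, new Path("input"));    // illustrative paths
FileOutputFormat.setOutputPath(job, new Path("output"));
job.waitForCompletion(true);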
Project: Chi-FRBCS-BigData-Ave    File: CVB0Driver.java
public static void runIteration(Configuration conf, Path corpusInput, Path modelInput, Path modelOutput,
                                int iterationNumber, int maxIterations, int numReduceTasks)
  throws IOException, ClassNotFoundException, InterruptedException {
  String jobName = String.format("Iteration %d of %d, input path: %s",
      iterationNumber, maxIterations, modelInput);
  log.info("About to run: " + jobName);
  Job job = new Job(conf, jobName);
  job.setJarByClass(CVB0Driver.class);
  job.setMapperClass(CachingCVB0Mapper.class);
  job.setCombinerClass(VectorSumReducer.class);
  job.setReducerClass(VectorSumReducer.class);
  job.setNumReduceTasks(numReduceTasks);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(VectorWritable.class);
  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  FileInputFormat.addInputPath(job, corpusInput);
  FileOutputFormat.setOutputPath(job, modelOutput);
  setModelPaths(job, modelInput);
  HadoopUtil.delete(conf, modelOutput);
  if (!job.waitForCompletion(true)) {
    throw new InterruptedException(String.format("Failed to complete iteration %d stage 1",
        iterationNumber));
  }
}
Project: openimaj    File: TextBytesJobUtil.java
public static Job createJob(Path[] inputPaths, Path outputPath, Map<String, String> metadata, Configuration config)
        throws IOException
{
    final Job job = new Job(config);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BytesWritable.class);
    job.setOutputFormatClass(MetadataSequenceFileOutputFormat.class);

    SequenceFileInputFormat.setInputPaths(job, inputPaths);
    SequenceFileOutputFormat.setOutputPath(job, outputPath);
    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    if (metadata != null)
        MetadataConfiguration.setMetadata(metadata, job.getConfiguration());

    return job;
}
Project: vdn-log-thinker    File: SortGroupResultPreprocessor.java
@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.printf("Usage: %s [generic options] <input> <output>\n",
                getClass().getSimpleName());
        ToolRunner.printGenericCommandUsage(System.err);
        return -1;
    }

    Job job = Job.getInstance(getConf());
    job.setMapperClass(SortGroupResultMapper.class);
    job.setNumReduceTasks(0);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    // Sort data by total number:
    job.setOutputFormatClass(SequenceFileOutputFormat.class);


    return job.waitForCompletion(true) ? 0 : 1;
}
Project: cloudera-homework    File: CreateSequenceFile.java
@Override
public int run(String[] args) throws Exception {

  if (args.length != 2) {
    System.out.printf("Usage: CreateSequenceFile <input dir> <output dir>\n");
    return -1;
  }

  Job job = new Job(getConf());
  job.setJarByClass(CreateSequenceFile.class);
  job.setJobName("Create Sequence File");

  job.setNumReduceTasks(0);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);

  FileInputFormat.setInputPaths(job, new Path(args[0]));
  SequenceFileOutputFormat.setOutputPath(job, new Path(args[1]));

  FileOutputFormat.setCompressOutput(job, true);
  FileOutputFormat.setOutputCompressorClass(job, SnappyCodec.class);

  SequenceFileOutputFormat.setOutputCompressionType(job,
      CompressionType.BLOCK);
  boolean success = job.waitForCompletion(true);
  return success ? 0 : 1;
}
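
Finally, a hedged way to inspect what any of these jobs wrote: a small standalone reader (the part-m-00000 file name is illustrative) that instantiates the key/value classes recorded in the SequenceFile header and prints each record:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.ReflectionUtils;

public class DumpSequenceFile {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path(args[0]); // e.g. <output dir>/part-m-00000
        try (SequenceFile.Reader reader =
                new SequenceFile.Reader(conf, SequenceFile.Reader.file(path))) {
            // Key and value classes come from the file header, so any SequenceFile can be dumped.
            Writable key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
            Writable value = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
            while (reader.next(key, value)) {
                System.out.println(key + "\t" + value);
            }
        }
    }
}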