Example source code for the Java class org.apache.hadoop.mapred.InputFormat

Project: ditb    File: TestTableInputFormat.java
void testInputFormat(Class<? extends InputFormat> clazz) throws IOException {
  final JobConf job = MapreduceTestingShim.getJobConf(mrCluster);
  job.setInputFormat(clazz);
  job.setOutputFormat(NullOutputFormat.class);
  job.setMapperClass(ExampleVerifier.class);
  job.setNumReduceTasks(0);
  LOG.debug("submitting job.");
  final RunningJob run = JobClient.runJob(job);
  assertTrue("job failed!", run.isSuccessful());
  assertEquals("Saw the wrong number of instances of the filtered-for row.", 2, run.getCounters()
      .findCounter(TestTableInputFormat.class.getName() + ":row", "aaa").getCounter());
  assertEquals("Saw any instances of the filtered out row.", 0, run.getCounters()
      .findCounter(TestTableInputFormat.class.getName() + ":row", "bbb").getCounter());
  assertEquals("Saw the wrong number of instances of columnA.", 1, run.getCounters()
      .findCounter(TestTableInputFormat.class.getName() + ":family", "columnA").getCounter());
  assertEquals("Saw the wrong number of instances of columnB.", 1, run.getCounters()
      .findCounter(TestTableInputFormat.class.getName() + ":family", "columnB").getCounter());
  assertEquals("Saw the wrong count of values for the filtered-for row.", 2, run.getCounters()
      .findCounter(TestTableInputFormat.class.getName() + ":value", "value aaa").getCounter());
  assertEquals("Saw the wrong count of values for the filtered-out row.", 0, run.getCounters()
      .findCounter(TestTableInputFormat.class.getName() + ":value", "value bbb").getCounter());
}
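ExampleVerifier itself lives in the HBase test source and is not reproduced on this page. The sketch below is a hypothetical stand-in (assumed to be nested in the same test class so it can reuse TestTableInputFormat.class.getName()) that only increments the row/family/value counters the assertions above read back; it is not the real implementation.
// Hypothetical verifier sketch, not the actual ExampleVerifier from the HBase tests.
public static class CountingVerifier extends MapReduceBase
    implements Mapper<ImmutableBytesWritable, Result, NullWritable, NullWritable> {
  @Override
  public void map(ImmutableBytesWritable key, Result value,
      OutputCollector<NullWritable, NullWritable> output, Reporter reporter)
      throws IOException {
    for (Cell cell : value.rawCells()) {
      // one counter group per dimension, matching what the assertions query
      reporter.incrCounter(TestTableInputFormat.class.getName() + ":row",
          Bytes.toString(CellUtil.cloneRow(cell)), 1);
      reporter.incrCounter(TestTableInputFormat.class.getName() + ":family",
          Bytes.toString(CellUtil.cloneFamily(cell)), 1);
      reporter.incrCounter(TestTableInputFormat.class.getName() + ":value",
          Bytes.toString(CellUtil.cloneValue(cell)), 1);
    }
  }
}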
Project: hadoop    File: TestMultipleInputs.java
public void testAddInputPathWithMapper() {
  final JobConf conf = new JobConf();
  MultipleInputs.addInputPath(conf, new Path("/foo"), TextInputFormat.class,
     MapClass.class);
  MultipleInputs.addInputPath(conf, new Path("/bar"),
     KeyValueTextInputFormat.class, MapClass2.class);
  final Map<Path, InputFormat> inputs = MultipleInputs
     .getInputFormatMap(conf);
  final Map<Path, Class<? extends Mapper>> maps = MultipleInputs
     .getMapperTypeMap(conf);

  assertEquals(TextInputFormat.class, inputs.get(new Path("/foo")).getClass());
  assertEquals(KeyValueTextInputFormat.class, inputs.get(new Path("/bar"))
     .getClass());
  assertEquals(MapClass.class, maps.get(new Path("/foo")));
  assertEquals(MapClass2.class, maps.get(new Path("/bar")));
}
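The per-path mapping these assertions verify is normally set up in a job driver. Below is a minimal, hedged driver sketch; the paths, output location and the two mapper bodies are illustrative stand-ins for the test's MapClass/MapClass2, not code from the Hadoop sources.
import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.KeyValueTextInputFormat;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.lib.MultipleInputs;

public class MultipleInputsDriver {

  // Reads plain text lines (LongWritable offset, Text line).
  public static class LineMapper extends MapReduceBase
      implements Mapper<LongWritable, Text, Text, Text> {
    public void map(LongWritable key, Text value,
        OutputCollector<Text, Text> out, Reporter reporter) throws IOException {
      out.collect(new Text("line"), value);
    }
  }

  // Reads tab-separated key/value pairs (Text key, Text value).
  public static class KeyValueMapper extends MapReduceBase
      implements Mapper<Text, Text, Text, Text> {
    public void map(Text key, Text value,
        OutputCollector<Text, Text> out, Reporter reporter) throws IOException {
      out.collect(key, value);
    }
  }

  public static void main(String[] args) throws IOException {
    JobConf job = new JobConf(MultipleInputsDriver.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    // Each input path carries its own InputFormat and Mapper, exactly the mapping
    // the test above reads back via getInputFormatMap/getMapperTypeMap.
    MultipleInputs.addInputPath(job, new Path("/foo"), TextInputFormat.class, LineMapper.class);
    MultipleInputs.addInputPath(job, new Path("/bar"), KeyValueTextInputFormat.class, KeyValueMapper.class);
    FileOutputFormat.setOutputPath(job, new Path("/out"));
    JobClient.runJob(job);
  }
}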
Project: hadoop    File: InputSampler.java
/**
 * From each split sampled, take the first numSamples / numSplits records.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException {
  InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
  ArrayList<K> samples = new ArrayList<K>(numSamples);
  int splitsToSample = Math.min(maxSplitsSampled, splits.length);
  int splitStep = splits.length / splitsToSample;
  int samplesPerSplit = numSamples / splitsToSample;
  long records = 0;
  for (int i = 0; i < splitsToSample; ++i) {
    RecordReader<K,V> reader = inf.getRecordReader(splits[i * splitStep],
        job, Reporter.NULL);
    K key = reader.createKey();
    V value = reader.createValue();
    while (reader.next(key, value)) {
      samples.add(key);
      key = reader.createKey();
      ++records;
      if ((i+1) * samplesPerSplit <= records) {
        break;
      }
    }
    reader.close();
  }
  return (K[])samples.toArray();
}
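This sampler is usually not invoked directly; in the old "mapred" API it is handed to InputSampler.writePartitionFile so that TotalOrderPartitioner can build split points for a globally sorted job. A hedged sketch of that wiring (key type, sample counts and the partition-file path are illustrative):
// Hedged sketch: sample up to 10,000 keys from at most 10 splits and write the
// partition file that TotalOrderPartitioner reads at run time.
public static void writePartitions(JobConf job) throws Exception {
  job.setPartitionerClass(TotalOrderPartitioner.class);
  TotalOrderPartitioner.setPartitionFile(job, new Path("/tmp/_partitions"));
  InputSampler.Sampler<Text, Text> sampler =
      new InputSampler.SplitSampler<Text, Text>(10000, 10);
  InputSampler.writePartitionFile(job, sampler);
}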
Project: hadoop    File: InputSampler.java
/**
 * For each split sampled, emit when the ratio of the number of records
 * retained to the total record count is less than the specified
 * frequency.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException {
  InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
  ArrayList<K> samples = new ArrayList<K>();
  int splitsToSample = Math.min(maxSplitsSampled, splits.length);
  int splitStep = splits.length / splitsToSample;
  long records = 0;
  long kept = 0;
  for (int i = 0; i < splitsToSample; ++i) {
    RecordReader<K,V> reader = inf.getRecordReader(splits[i * splitStep],
        job, Reporter.NULL);
    K key = reader.createKey();
    V value = reader.createValue();
    while (reader.next(key, value)) {
      ++records;
      if ((double) kept / records < freq) {
        ++kept;
        samples.add(key);
        key = reader.createKey();
      }
    }
    reader.close();
  }
  return (K[])samples.toArray();
}
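This frequency-based variant is constructed with the retention ratio instead of a fixed sample count; a hedged one-liner for contrast with the SplitSampler above (type parameters and numbers are illustrative):
// keep roughly 1% of the records seen, reading at most 10 splits
InputSampler.Sampler<Text, Text> sampler =
    new InputSampler.IntervalSampler<Text, Text>(0.01, 10);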
Project: ditb    File: TableMapReduceUtil.java
/**
 * @see org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil#addDependencyJars(org.apache.hadoop.mapreduce.Job)
 */
public static void addDependencyJars(JobConf job) throws IOException {
  org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil.addHBaseDependencyJars(job);
  org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil.addDependencyJars(
    job,
    // when making changes here, consider also mapreduce.TableMapReduceUtil
    // pull job classes
    job.getMapOutputKeyClass(),
    job.getMapOutputValueClass(),
    job.getOutputKeyClass(),
    job.getOutputValueClass(),
    job.getPartitionerClass(),
    job.getClass("mapred.input.format.class", TextInputFormat.class, InputFormat.class),
    job.getClass("mapred.output.format.class", TextOutputFormat.class, OutputFormat.class),
    job.getCombinerClass());
}
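A hedged sketch of where this helper typically sits in an old-API HBase driver; the table name, scan columns and use of IdentityTableMap are illustrative, not taken from the ditb sources.
public static void runScanJob() throws IOException {
  JobConf job = new JobConf(HBaseConfiguration.create());
  job.setJobName("scan-mytable");
  TableMapReduceUtil.initTableMapJob("mytable", "cf:qual", IdentityTableMap.class,
      ImmutableBytesWritable.class, Result.class, job);
  job.setNumReduceTasks(0);
  job.setOutputFormat(NullOutputFormat.class);
  TableMapReduceUtil.addDependencyJars(job); // ship HBase and job classes with the job
  JobClient.runJob(job);
}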
Project: aliyun-oss-hadoop-fs    File: TestMultipleInputs.java
public void testAddInputPathWithMapper() {
  final JobConf conf = new JobConf();
  MultipleInputs.addInputPath(conf, new Path("/foo"), TextInputFormat.class,
     MapClass.class);
  MultipleInputs.addInputPath(conf, new Path("/bar"),
     KeyValueTextInputFormat.class, MapClass2.class);
  final Map<Path, InputFormat> inputs = MultipleInputs
     .getInputFormatMap(conf);
  final Map<Path, Class<? extends Mapper>> maps = MultipleInputs
     .getMapperTypeMap(conf);

  assertEquals(TextInputFormat.class, inputs.get(new Path("/foo")).getClass());
  assertEquals(KeyValueTextInputFormat.class, inputs.get(new Path("/bar"))
     .getClass());
  assertEquals(MapClass.class, maps.get(new Path("/foo")));
  assertEquals(MapClass2.class, maps.get(new Path("/bar")));
}
Project: aliyun-oss-hadoop-fs    File: InputSampler.java
/**
 * From each split sampled, take the first numSamples / numSplits records.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException {
  InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
  ArrayList<K> samples = new ArrayList<K>(numSamples);
  int splitsToSample = Math.min(maxSplitsSampled, splits.length);
  int splitStep = splits.length / splitsToSample;
  int samplesPerSplit = numSamples / splitsToSample;
  long records = 0;
  for (int i = 0; i < splitsToSample; ++i) {
    RecordReader<K,V> reader = inf.getRecordReader(splits[i * splitStep],
        job, Reporter.NULL);
    K key = reader.createKey();
    V value = reader.createValue();
    while (reader.next(key, value)) {
      samples.add(key);
      key = reader.createKey();
      ++records;
      if ((i+1) * samplesPerSplit <= records) {
        break;
      }
    }
    reader.close();
  }
  return (K[])samples.toArray();
}
Project: aliyun-oss-hadoop-fs    File: InputSampler.java
/**
 * For each split sampled, emit when the ratio of the number of records
 * retained to the total record count is less than the specified
 * frequency.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException {
  InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
  ArrayList<K> samples = new ArrayList<K>();
  int splitsToSample = Math.min(maxSplitsSampled, splits.length);
  int splitStep = splits.length / splitsToSample;
  long records = 0;
  long kept = 0;
  for (int i = 0; i < splitsToSample; ++i) {
    RecordReader<K,V> reader = inf.getRecordReader(splits[i * splitStep],
        job, Reporter.NULL);
    K key = reader.createKey();
    V value = reader.createValue();
    while (reader.next(key, value)) {
      ++records;
      if ((double) kept / records < freq) {
        ++kept;
        samples.add(key);
        key = reader.createKey();
      }
    }
    reader.close();
  }
  return (K[])samples.toArray();
}
Project: big-c    File: TestMultipleInputs.java
public void testAddInputPathWithMapper() {
  final JobConf conf = new JobConf();
  MultipleInputs.addInputPath(conf, new Path("/foo"), TextInputFormat.class,
     MapClass.class);
  MultipleInputs.addInputPath(conf, new Path("/bar"),
     KeyValueTextInputFormat.class, MapClass2.class);
  final Map<Path, InputFormat> inputs = MultipleInputs
     .getInputFormatMap(conf);
  final Map<Path, Class<? extends Mapper>> maps = MultipleInputs
     .getMapperTypeMap(conf);

  assertEquals(TextInputFormat.class, inputs.get(new Path("/foo")).getClass());
  assertEquals(KeyValueTextInputFormat.class, inputs.get(new Path("/bar"))
     .getClass());
  assertEquals(MapClass.class, maps.get(new Path("/foo")));
  assertEquals(MapClass2.class, maps.get(new Path("/bar")));
}
Project: big-c    File: InputSampler.java
/**
 * From each split sampled, take the first numSamples / numSplits records.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException {
  InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
  ArrayList<K> samples = new ArrayList<K>(numSamples);
  int splitsToSample = Math.min(maxSplitsSampled, splits.length);
  int splitStep = splits.length / splitsToSample;
  int samplesPerSplit = numSamples / splitsToSample;
  long records = 0;
  for (int i = 0; i < splitsToSample; ++i) {
    RecordReader<K,V> reader = inf.getRecordReader(splits[i * splitStep],
        job, Reporter.NULL);
    K key = reader.createKey();
    V value = reader.createValue();
    while (reader.next(key, value)) {
      samples.add(key);
      key = reader.createKey();
      ++records;
      if ((i+1) * samplesPerSplit <= records) {
        break;
      }
    }
    reader.close();
  }
  return (K[])samples.toArray();
}
Project: big-c    File: InputSampler.java
/**
 * For each split sampled, emit when the ratio of the number of records
 * retained to the total record count is less than the specified
 * frequency.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException {
  InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
  ArrayList<K> samples = new ArrayList<K>();
  int splitsToSample = Math.min(maxSplitsSampled, splits.length);
  int splitStep = splits.length / splitsToSample;
  long records = 0;
  long kept = 0;
  for (int i = 0; i < splitsToSample; ++i) {
    RecordReader<K,V> reader = inf.getRecordReader(splits[i * splitStep],
        job, Reporter.NULL);
    K key = reader.createKey();
    V value = reader.createValue();
    while (reader.next(key, value)) {
      ++records;
      if ((double) kept / records < freq) {
        ++kept;
        samples.add(key);
        key = reader.createKey();
      }
    }
    reader.close();
  }
  return (K[])samples.toArray();
}
Project: hive-phoenix-handler    File: HiveInputFormat.java
public static InputFormat<WritableComparable, Writable> getInputFormatFromCache(
  Class inputFormatClass, JobConf job) throws IOException {
  InputFormat<WritableComparable, Writable> instance = inputFormats.get(inputFormatClass);
  if (instance == null) {
    try {
      instance = (InputFormat<WritableComparable, Writable>) ReflectionUtil
          .newInstance(inputFormatClass, job);
      // HBase input formats are not thread safe today. See HIVE-8808.
      String inputFormatName = inputFormatClass.getName().toLowerCase();
      if (!inputFormatName.contains("hbase")) {
        inputFormats.put(inputFormatClass, instance);
      }
    } catch (Exception e) {
      throw new IOException("Cannot create an instance of InputFormat class "
          + inputFormatClass.getName() + " as specified in mapredWork!", e);
    }
  }
  return instance;
}
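A hedged usage sketch of the cache above: the caller resolves the format once and then asks it for splits (the method and parameter names are illustrative, not Hive's actual call sites).
static InputSplit[] splitsFor(Class<?> inputFormatClass, JobConf job, int numSplits)
    throws IOException {
  InputFormat<WritableComparable, Writable> format =
      getInputFormatFromCache(inputFormatClass, job);
  return format.getSplits(job, numSplits);
}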
Project: drill    File: SequenceFileRecordReader.java
private org.apache.hadoop.mapred.RecordReader<BytesWritable, BytesWritable> getRecordReader(
  final InputFormat<BytesWritable, BytesWritable> inputFormat,
  final JobConf jobConf) throws ExecutionSetupException {
  try {
    final UserGroupInformation ugi = ImpersonationUtil.createProxyUgi(this.opUserName, this.queryUserName);
    return ugi.doAs(new PrivilegedExceptionAction<org.apache.hadoop.mapred.RecordReader<BytesWritable, BytesWritable>>() {
      @Override
      public org.apache.hadoop.mapred.RecordReader<BytesWritable, BytesWritable> run() throws Exception {
        return inputFormat.getRecordReader(split, jobConf, Reporter.NULL);
      }
    });
  } catch (IOException | InterruptedException e) {
    throw new ExecutionSetupException(
      String.format("Error in creating sequencefile reader for file: %s, start: %d, length: %d",
        split.getPath(), split.getStart(), split.getLength()), e);
  }
}
Project: drill    File: ConvertHiveParquetScanToDrillParquetScan.java
/**
 * Get the input format from given {@link StorageDescriptor}
 * @param properties
 * @param hiveReadEntry
 * @param sd
 * @return {@link InputFormat} class or null if a failure has occurred. Failure is logged as a warning.
 */
private Class<? extends InputFormat<?, ?>> getInputFormatFromSD(final Properties properties,
    final HiveReadEntry hiveReadEntry, final StorageDescriptor sd, final HiveConf hiveConf) {
  final Table hiveTable = hiveReadEntry.getTable();
  try {
    final String inputFormatName = sd.getInputFormat();
    if (!Strings.isNullOrEmpty(inputFormatName)) {
      return (Class<? extends InputFormat<?, ?>>) Class.forName(inputFormatName);
    }

    final JobConf job = new JobConf(hiveConf);
    HiveUtilities.addConfToJob(job, properties);
    return HiveUtilities.getInputFormatClass(job, sd, hiveTable);
  } catch (final Exception e) {
    logger.warn("Failed to get InputFormat class from Hive table '{}.{}'. StorageDescriptor [{}]",
        hiveTable.getDbName(), hiveTable.getTableName(), sd.toString(), e);
    return null;
  }
}
Project: drill    File: HiveUtilities.java
/**
 * Utility method which gets the table or partition {@link InputFormat} class. First it
 * tries to get the class name from the given StorageDescriptor object; if that is absent, it falls
 * back to the StorageHandler class set in the table properties. If neither is found, an exception is thrown.
 * @param job {@link JobConf} instance needed in case the table is a StorageHandler based table.
 * @param sd {@link StorageDescriptor} instance of currently reading partition or table (for non-partitioned tables).
 * @param table Table object
 * @throws Exception
 */
public static Class<? extends InputFormat<?, ?>> getInputFormatClass(final JobConf job, final StorageDescriptor sd,
    final Table table) throws Exception {
  final String inputFormatName = sd.getInputFormat();
  if (Strings.isNullOrEmpty(inputFormatName)) {
    final String storageHandlerClass = table.getParameters().get(META_TABLE_STORAGE);
    if (Strings.isNullOrEmpty(storageHandlerClass)) {
      throw new ExecutionSetupException("Unable to get Hive table InputFormat class. There is neither " +
          "InputFormat class explicitly specified nor StorageHandler class");
    }
    final HiveStorageHandler storageHandler = HiveUtils.getStorageHandler(job, storageHandlerClass);
    return (Class<? extends InputFormat<?, ?>>) storageHandler.getInputFormatClass();
  } else {
    return (Class<? extends InputFormat<?, ?>>) Class.forName(inputFormatName);
  }
}
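A hedged sketch of what a caller does with the resolved class: set it on the JobConf, let the JobConf instantiate it, and ask for splits, much like HiveScan.splitInput further down this page (parameters are illustrative).
static InputSplit[] splitsFor(final JobConf job, final StorageDescriptor sd,
    final Table table, final Path location) throws Exception {
  job.setInputFormat(HiveUtilities.getInputFormatClass(job, sd, table));
  FileInputFormat.addInputPath(job, location);
  final InputFormat<?, ?> format = job.getInputFormat();
  return format.getSplits(job, 1);
}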
Project: hadoop-etl-udfs    File: HdfsSerDeImportServiceTest.java
private void runImportRCFile(ExaIterator ctx, List<HCatTableColumn> columns, List<HCatTableColumn> partitionColumns, List<OutputColumnSpec> outputColumns, String file) throws Exception {
    List<HCatSerDeParameter> serDeParameters = new ArrayList<>();
    serDeParameters.add(new HCatSerDeParameter("serialization.format", "1"));

    String inputFormatClassName = "org.apache.hadoop.hive.ql.io.RCFileInputFormat";
    String serDeClassName = "org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe";
    String hdfsUser = "hdfs";
    boolean useKerberos = false;

    List<String> hdfsServers = new ArrayList<>();
    hdfsServers.add("file:///");
    final Configuration conf = new Configuration();
    FileSystem fs = HdfsService.getFileSystem(hdfsServers, conf);

    InputFormat<?, ?> inputFormat = (InputFormat<?, ?>) UdfUtils.getInstanceByName(inputFormatClassName);
    SerDe serDe = (SerDe) UdfUtils.getInstanceByName(serDeClassName);
    HdfsSerDeImportService.importFile(fs, file, partitionColumns, inputFormat, serDe, serDeParameters, hdfsServers, hdfsUser, columns, outputColumns, useKerberos, ctx);
}
Project: presto    File: HiveFileIterator.java
public HiveFileIterator(
        Path path,
        FileSystem fileSystem,
        DirectoryLister directoryLister,
        NamenodeStats namenodeStats,
        String partitionName,
        InputFormat<?, ?> inputFormat,
        Properties schema,
        List<HivePartitionKey> partitionKeys,
        TupleDomain<HiveColumnHandle> effectivePredicate)
{
    this.partitionName = requireNonNull(partitionName, "partitionName is null");
    this.inputFormat = requireNonNull(inputFormat, "inputFormat is null");
    this.schema = requireNonNull(schema, "schema is null");
    this.partitionKeys = requireNonNull(partitionKeys, "partitionKeys is null");
    this.effectivePredicate = requireNonNull(effectivePredicate, "effectivePredicate is null");
    this.path = requireNonNull(path, "path is null");
    this.fileSystem = requireNonNull(fileSystem, "fileSystem is null");
    this.directoryLister = requireNonNull(directoryLister, "directoryLister is null");
    this.namenodeStats = requireNonNull(namenodeStats, "namenodeStats is null");
}
Project: hadoop-2.6.0-cdh5.4.3    File: InputSampler.java
/**
 * From each split sampled, take the first numSamples / numSplits records.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException {
  InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
  ArrayList<K> samples = new ArrayList<K>(numSamples);
  int splitsToSample = Math.min(maxSplitsSampled, splits.length);
  int splitStep = splits.length / splitsToSample;
  int samplesPerSplit = numSamples / splitsToSample;
  long records = 0;
  for (int i = 0; i < splitsToSample; ++i) {
    RecordReader<K,V> reader = inf.getRecordReader(splits[i * splitStep],
        job, Reporter.NULL);
    K key = reader.createKey();
    V value = reader.createValue();
    while (reader.next(key, value)) {
      samples.add(key);
      key = reader.createKey();
      ++records;
      if ((i+1) * samplesPerSplit <= records) {
        break;
      }
    }
    reader.close();
  }
  return (K[])samples.toArray();
}
Project: hadoop-2.6.0-cdh5.4.3    File: InputSampler.java
/**
 * For each split sampled, emit when the ratio of the number of records
 * retained to the total record count is less than the specified
 * frequency.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException {
  InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
  ArrayList<K> samples = new ArrayList<K>();
  int splitsToSample = Math.min(maxSplitsSampled, splits.length);
  int splitStep = splits.length / splitsToSample;
  long records = 0;
  long kept = 0;
  for (int i = 0; i < splitsToSample; ++i) {
    RecordReader<K,V> reader = inf.getRecordReader(splits[i * splitStep],
        job, Reporter.NULL);
    K key = reader.createKey();
    V value = reader.createValue();
    while (reader.next(key, value)) {
      ++records;
      if ((double) kept / records < freq) {
        ++kept;
        samples.add(key);
        key = reader.createKey();
      }
    }
    reader.close();
  }
  return (K[])samples.toArray();
}
Project: FlexMap    File: InputSampler.java
/**
 * From each split sampled, take the first numSamples / numSplits records.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException {
  InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
  ArrayList<K> samples = new ArrayList<K>(numSamples);
  int splitsToSample = Math.min(maxSplitsSampled, splits.length);
  int splitStep = splits.length / splitsToSample;
  int samplesPerSplit = numSamples / splitsToSample;
  long records = 0;
  for (int i = 0; i < splitsToSample; ++i) {
    RecordReader<K,V> reader = inf.getRecordReader(splits[i * splitStep],
        job, Reporter.NULL);
    K key = reader.createKey();
    V value = reader.createValue();
    while (reader.next(key, value)) {
      samples.add(key);
      key = reader.createKey();
      ++records;
      if ((i+1) * samplesPerSplit <= records) {
        break;
      }
    }
    reader.close();
  }
  return (K[])samples.toArray();
}
Project: hadoop-2.6.0-cdh5.4.3    File: InputSampler.java
/**
 * For each split sampled, emit when the ratio of the number of records
 * retained to the total record count is less than the specified
 * frequency.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException {
  InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
  ArrayList<K> samples = new ArrayList<K>();
  int splitsToSample = Math.min(maxSplitsSampled, splits.length);
  int splitStep = splits.length / splitsToSample;
  long records = 0;
  long kept = 0;
  for (int i = 0; i < splitsToSample; ++i) {
    RecordReader<K,V> reader = inf.getRecordReader(splits[i * splitStep],
        job, Reporter.NULL);
    K key = reader.createKey();
    V value = reader.createValue();
    while (reader.next(key, value)) {
      ++records;
      if ((double) kept / records < freq) {
        ++kept;
        samples.add(key);
        key = reader.createKey();
      }
    }
    reader.close();
  }
  return (K[])samples.toArray();
}
Project: hops    File: TestMultipleInputs.java
@Test
public void testAddInputPathWithMapper() {
  final JobConf conf = new JobConf();
  MultipleInputs.addInputPath(conf, new Path("/foo"), TextInputFormat.class,
     MapClass.class);
  MultipleInputs.addInputPath(conf, new Path("/bar"),
     KeyValueTextInputFormat.class, MapClass2.class);
  final Map<Path, InputFormat> inputs = MultipleInputs
     .getInputFormatMap(conf);
  final Map<Path, Class<? extends Mapper>> maps = MultipleInputs
     .getMapperTypeMap(conf);

  assertEquals(TextInputFormat.class, inputs.get(new Path("/foo")).getClass());
  assertEquals(KeyValueTextInputFormat.class, inputs.get(new Path("/bar"))
     .getClass());
  assertEquals(MapClass.class, maps.get(new Path("/foo")));
  assertEquals(MapClass2.class, maps.get(new Path("/bar")));
}
Project: hadoop-2.6.0-cdh5.4.3    File: TestMultipleInputs.java
public void testAddInputPathWithMapper() {
  final JobConf conf = new JobConf();
  MultipleInputs.addInputPath(conf, new Path("/foo"), TextInputFormat.class,
     MapClass.class);
  MultipleInputs.addInputPath(conf, new Path("/bar"),
     KeyValueTextInputFormat.class, MapClass2.class);
  final Map<Path, InputFormat> inputs = MultipleInputs
     .getInputFormatMap(conf);
  final Map<Path, Class<? extends Mapper>> maps = MultipleInputs
     .getMapperTypeMap(conf);

  assertEquals(TextInputFormat.class, inputs.get(new Path("/foo")).getClass());
  assertEquals(KeyValueTextInputFormat.class, inputs.get(new Path("/bar"))
     .getClass());
  assertEquals(MapClass.class, maps.get(new Path("/foo")));
  assertEquals(MapClass2.class, maps.get(new Path("/bar")));
}
Project: presto    File: TestHiveFileFormats.java
@Test
public void testParquet()
        throws Exception
{
    List<TestColumn> testColumns = getTestColumnsSupportedByParquet();

    HiveOutputFormat<?, ?> outputFormat = new MapredParquetOutputFormat();
    InputFormat<?, ?> inputFormat = new MapredParquetInputFormat();
    @SuppressWarnings("deprecation")
    SerDe serde = new ParquetHiveSerDe();
    File file = File.createTempFile("presto_test", "parquet");
    file.delete();
    try {
        FileSplit split = createTestFile(file.getAbsolutePath(), outputFormat, serde, null, testColumns, NUM_ROWS);
        HiveRecordCursorProvider cursorProvider = new ParquetRecordCursorProvider(false);
        testCursorProvider(cursorProvider, split, inputFormat, serde, testColumns, NUM_ROWS);
    }
    finally {
        //noinspection ResultOfMethodCallIgnored
        file.delete();
    }
}
Project: hadoop-EAR    File: InputSampler.java
/**
 * For each split sampled, emit when the ratio of the number of records
 * retained to the total record count is less than the specified
 * frequency.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException {
  InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
  ArrayList<K> samples = new ArrayList<K>();
  int splitsToSample = Math.min(maxSplitsSampled, splits.length);
  int splitStep = splits.length / splitsToSample;
  long records = 0;
  long kept = 0;
  for (int i = 0; i < splitsToSample; ++i) {
    RecordReader<K,V> reader = inf.getRecordReader(splits[i * splitStep],
        job, Reporter.NULL);
    K key = reader.createKey();
    V value = reader.createValue();
    while (reader.next(key, value)) {
      ++records;
      if ((double) kept / records < freq) {
        ++kept;
        samples.add(key);
        key = reader.createKey();
      }
    }
    reader.close();
  }
  return (K[])samples.toArray();
}
Project: hadoop-EAR    File: DistCp.java
private static JobConf createJobConf(Configuration conf, boolean useFastCopy) {
  Class<? extends InputFormat> inputFormat =
    (useFastCopy) ? FastCopyInputFormat.class : CopyInputFormat.class;
  JobConf jobconf = new JobConf(conf, DistCp.class);
  jobconf.setJobName(NAME);

  // turn off speculative execution, because DFS doesn't handle
  // multiple writers to the same file.
  jobconf.setReduceSpeculativeExecution(false);
  jobconf.setMapOutputKeyClass(FilePairComparable.class);
  jobconf.setMapOutputValueClass(Text.class);
  jobconf.setOutputKeyClass(FilePairComparable.class);
  jobconf.setOutputValueClass(Text.class);

  jobconf.setInputFormat(inputFormat);
  jobconf.setMapperClass(CopyFilesTask.class);
  jobconf.setReducerClass(CopyFilesTask.class);

  // Prevent the reducer from starting until all maps are done.
  jobconf.setInt("mapred.job.rushreduce.reduce.threshold", 0);
  jobconf.setFloat("mapred.reduce.slowstart.completed.maps", 1.0f);

  return jobconf;
}
Project: hadoop-plus    File: TestMultipleInputs.java
public void testAddInputPathWithMapper() {
  final JobConf conf = new JobConf();
  MultipleInputs.addInputPath(conf, new Path("/foo"), TextInputFormat.class,
     MapClass.class);
  MultipleInputs.addInputPath(conf, new Path("/bar"),
     KeyValueTextInputFormat.class, MapClass2.class);
  final Map<Path, InputFormat> inputs = MultipleInputs
     .getInputFormatMap(conf);
  final Map<Path, Class<? extends Mapper>> maps = MultipleInputs
     .getMapperTypeMap(conf);

  assertEquals(TextInputFormat.class, inputs.get(new Path("/foo")).getClass());
  assertEquals(KeyValueTextInputFormat.class, inputs.get(new Path("/bar"))
     .getClass());
  assertEquals(MapClass.class, maps.get(new Path("/foo")));
  assertEquals(MapClass2.class, maps.get(new Path("/bar")));
}
Project: presto    File: TestHiveFileFormats.java
@Test
public void testRCBinary()
        throws Exception
{
    List<TestColumn> testColumns = ImmutableList.copyOf(filter(TEST_COLUMNS, testColumn -> {
        // RC file does not support complex type as key of a map
        return !testColumn.getName().equals("t_map_null_key_complex_key_value");
    }));

    HiveOutputFormat<?, ?> outputFormat = new RCFileOutputFormat();
    InputFormat<?, ?> inputFormat = new RCFileInputFormat<>();
    @SuppressWarnings("deprecation")
    SerDe serde = new LazyBinaryColumnarSerDe();
    File file = File.createTempFile("presto_test", "rc-binary");
    try {
        FileSplit split = createTestFile(file.getAbsolutePath(), outputFormat, serde, null, testColumns, NUM_ROWS);
        testCursorProvider(new ColumnarBinaryHiveRecordCursorProvider(), split, inputFormat, serde, testColumns, NUM_ROWS);
        testCursorProvider(new GenericHiveRecordCursorProvider(), split, inputFormat, serde, testColumns, NUM_ROWS);
    }
    finally {
        //noinspection ResultOfMethodCallIgnored
        file.delete();
    }
}
Project: hadoop-plus    File: InputSampler.java
/**
 * From each split sampled, take the first numSamples / numSplits records.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException {
  InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
  ArrayList<K> samples = new ArrayList<K>(numSamples);
  int splitsToSample = Math.min(maxSplitsSampled, splits.length);
  int splitStep = splits.length / splitsToSample;
  int samplesPerSplit = numSamples / splitsToSample;
  long records = 0;
  for (int i = 0; i < splitsToSample; ++i) {
    RecordReader<K,V> reader = inf.getRecordReader(splits[i * splitStep],
        job, Reporter.NULL);
    K key = reader.createKey();
    V value = reader.createValue();
    while (reader.next(key, value)) {
      samples.add(key);
      key = reader.createKey();
      ++records;
      if ((i+1) * samplesPerSplit <= records) {
        break;
      }
    }
    reader.close();
  }
  return (K[])samples.toArray();
}
Project: presto    File: TestHiveFileFormats.java
@Test
public void testParquetUseColumnNames()
        throws Exception
{
    List<TestColumn> testColumns = getTestColumnsSupportedByParquet();

    HiveOutputFormat<?, ?> outputFormat = new MapredParquetOutputFormat();
    InputFormat<?, ?> inputFormat = new MapredParquetInputFormat();
    @SuppressWarnings("deprecation")
    SerDe serde = new ParquetHiveSerDe();
    File file = File.createTempFile("presto_test", "parquet");
    file.delete();
    try {
        FileSplit split = createTestFile(file.getAbsolutePath(), outputFormat, serde, null, testColumns, NUM_ROWS);
        // Reverse the order of the columns to test access by name, not by index
        Collections.reverse(testColumns);
        HiveRecordCursorProvider cursorProvider = new ParquetRecordCursorProvider(true);
        testCursorProvider(cursorProvider, split, inputFormat, serde, testColumns, NUM_ROWS);
    }
    finally {
        //noinspection ResultOfMethodCallIgnored
        file.delete();
    }
}
Project: presto    File: TestHiveFileFormats.java
@Test
public void testOrcDataStream()
        throws Exception
{
    HiveOutputFormat<?, ?> outputFormat = new org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat();
    InputFormat<?, ?> inputFormat = new org.apache.hadoop.hive.ql.io.orc.OrcInputFormat();
    @SuppressWarnings("deprecation")
    SerDe serde = new org.apache.hadoop.hive.ql.io.orc.OrcSerde();
    File file = File.createTempFile("presto_test", "orc");
    file.delete();
    try {
        FileSplit split = createTestFile(file.getAbsolutePath(), outputFormat, serde, null, TEST_COLUMNS, NUM_ROWS);
        testPageSourceFactory(new OrcPageSourceFactory(TYPE_MANAGER), split, inputFormat, serde, TEST_COLUMNS);
    }
    finally {
        //noinspection ResultOfMethodCallIgnored
        file.delete();
    }
}
Project: hazelcast-jet    File: ReadHdfsP.java
@Override
public void init(@Nonnull Context context) {
    logger = context.jetInstance().getHazelcastInstance().getLoggingService().getLogger(ReadHdfsP.class);
    try {
        int totalParallelism = context.totalParallelism();
        InputFormat inputFormat = jobConf.getInputFormat();
        InputSplit[] splits = inputFormat.getSplits(jobConf, totalParallelism);
        IndexedInputSplit[] indexedInputSplits = new IndexedInputSplit[splits.length];
        Arrays.setAll(indexedInputSplits, i -> new IndexedInputSplit(i, splits[i]));

        Address[] addrs = context.jetInstance().getCluster().getMembers()
                .stream().map(Member::getAddress).toArray(Address[]::new);
        assigned = assignSplitsToMembers(indexedInputSplits, addrs);
        printAssignments(assigned);
    } catch (IOException e) {
        throw rethrow(e);
    }
}
Project: hazelcast-jet    File: ReadHdfsP.java
@Override
@Nonnull
public List<Processor> get(int count) {
    Map<Integer, List<IndexedInputSplit>> processorToSplits =
            range(0, assignedSplits.size()).mapToObj(i -> new SimpleImmutableEntry<>(i, assignedSplits.get(i)))
                    .collect(groupingBy(e -> e.getKey() % count,
                            mapping(Entry::getValue, toList())));
    range(0, count)
            .forEach(processor -> processorToSplits.computeIfAbsent(processor, x -> emptyList()));
    InputFormat inputFormat = jobConf.getInputFormat();

    return processorToSplits
            .values().stream()
            .map(splits -> splits.isEmpty()
                    ? Processors.noopP().get()
                    : new ReadHdfsP<>(splits.stream()
                    .map(IndexedInputSplit::getSplit)
                    .map(split -> uncheckCall(() ->
                            inputFormat.getRecordReader(split, jobConf, NULL)))
                    .collect(toList()), mapper)
            ).collect(toList());
}
Project: presto    File: HiveUtil.java
static InputFormat<?, ?> getInputFormat(Configuration configuration, Properties schema, boolean symlinkTarget)
{
    String inputFormatName = getInputFormatName(schema);
    try {
        JobConf jobConf = new JobConf(configuration);

        Class<? extends InputFormat<?, ?>> inputFormatClass = getInputFormatClass(jobConf, inputFormatName);
        if (symlinkTarget && (inputFormatClass == SymlinkTextInputFormat.class)) {
            // symlink targets are always TextInputFormat
            inputFormatClass = TextInputFormat.class;
        }

        return ReflectionUtils.newInstance(inputFormatClass, jobConf);
    }
    catch (ClassNotFoundException | RuntimeException e) {
        throw new RuntimeException("Unable to create input format " + inputFormatName, e);
    }
}
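A hedged sketch of the step that usually follows: instantiate the format for a partition's schema and open a RecordReader for one split, mirroring the Reporter.NULL pattern in the Drill snippet earlier (method and variable names are illustrative).
static RecordReader<?, ?> openReader(Configuration configuration, Properties schema,
    InputSplit split) throws IOException {
  InputFormat<?, ?> format = getInputFormat(configuration, schema, true);
  return format.getRecordReader(split, new JobConf(configuration), Reporter.NULL);
}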
Project: pbase    File: TableMapReduceUtil.java
/**
 * @see org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil#addDependencyJars(org.apache.hadoop.mapreduce.Job)
 */
public static void addDependencyJars(JobConf job) throws IOException {
  org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil.addHBaseDependencyJars(job);
  org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil.addDependencyJars(
    job,
    // when making changes here, consider also mapreduce.TableMapReduceUtil
    // pull job classes
    job.getMapOutputKeyClass(),
    job.getMapOutputValueClass(),
    job.getOutputKeyClass(),
    job.getOutputValueClass(),
    job.getPartitionerClass(),
    job.getClass("mapred.input.format.class", TextInputFormat.class, InputFormat.class),
    job.getClass("mapred.output.format.class", TextOutputFormat.class, OutputFormat.class),
    job.getCombinerClass());
}
Project: FlexMap    File: TestMultipleInputs.java
public void testAddInputPathWithMapper() {
  final JobConf conf = new JobConf();
  MultipleInputs.addInputPath(conf, new Path("/foo"), TextInputFormat.class,
     MapClass.class);
  MultipleInputs.addInputPath(conf, new Path("/bar"),
     KeyValueTextInputFormat.class, MapClass2.class);
  final Map<Path, InputFormat> inputs = MultipleInputs
     .getInputFormatMap(conf);
  final Map<Path, Class<? extends Mapper>> maps = MultipleInputs
     .getMapperTypeMap(conf);

  assertEquals(TextInputFormat.class, inputs.get(new Path("/foo")).getClass());
  assertEquals(KeyValueTextInputFormat.class, inputs.get(new Path("/bar"))
     .getClass());
  assertEquals(MapClass.class, maps.get(new Path("/foo")));
  assertEquals(MapClass2.class, maps.get(new Path("/bar")));
}
Project: QDrill    File: ConvertHiveParquetScanToDrillParquetScan.java
/**
 * Rule is matched when all of the following match:
 *  1) GroupScan in the given DrillScanRel is a {@link HiveScan}
 *  2) {@link HiveScan} is not already rewritten using Drill's native readers
 *  3) InputFormat in the Hive table metadata and in all partition metadata is the same
 *     {@link MapredParquetInputFormat}
 *  4) No error occurred while checking the above conditions; any error is logged as a warning.
 *
 * @param call
 * @return True if the rule can be applied. False otherwise
 */
@Override
public boolean matches(RelOptRuleCall call) {
  final DrillScanRel scanRel = (DrillScanRel) call.rel(0);
  final PlannerSettings settings = PrelUtil.getPlannerSettings(call.getPlanner());

  if (!(scanRel.getGroupScan() instanceof HiveScan) || ((HiveScan) scanRel.getGroupScan()).isNativeReader()) {
    return false;
  }

  final HiveScan hiveScan = (HiveScan) scanRel.getGroupScan();
  final Table hiveTable = hiveScan.hiveReadEntry.getTable();

  final Class<? extends InputFormat> tableInputFormat = getInputFormatFromSD(hiveTable, hiveTable.getSd());
  if (tableInputFormat == null || !tableInputFormat.equals(MapredParquetInputFormat.class)) {
    return false;
  }

  final List<HivePartition> partitions = hiveScan.hiveReadEntry.getHivePartitionWrappers();
  if (partitions == null) {
    return true;
  }

  // Make sure all partitions have the same input format as the table input format
  for (HivePartition partition : partitions) {
    Class<? extends InputFormat> inputFormat = getInputFormatFromSD(hiveTable, partition.getPartition().getSd());
    if (inputFormat == null || !inputFormat.equals(tableInputFormat)) {
      return false;
    }
  }

  return true;
}
Project: QDrill    File: ConvertHiveParquetScanToDrillParquetScan.java
/**
 * Get the input format from given {@link StorageDescriptor}
 * @param hiveTable
 * @param sd
 * @return {@link InputFormat} class or null if a failure has occurred. Failure is logged as a warning.
 */
private Class<? extends InputFormat> getInputFormatFromSD(final Table hiveTable, final StorageDescriptor sd) {
  try {
    return (Class<? extends InputFormat>) Class.forName(sd.getInputFormat());
  } catch (ReflectiveOperationException e) {
    logger.warn("Failed to get InputFormat class from Hive table '{}.{}'. StorageDescriptor [{}]",
        hiveTable.getDbName(), hiveTable.getTableName(), sd.toString(), e);
    return null;
  }
}
Project: QDrill    File: HiveScan.java
private void splitInput(final Properties properties, final StorageDescriptor sd, final Partition partition)
    throws ReflectiveOperationException, IOException {
  final JobConf job = new JobConf();
  for (final Object obj : properties.keySet()) {
    job.set((String) obj, (String) properties.get(obj));
  }
  for (final Map.Entry<String, String> entry : hiveReadEntry.hiveConfigOverride.entrySet()) {
    job.set(entry.getKey(), entry.getValue());
  }
  InputFormat<?, ?> format = (InputFormat<?, ?>)
      Class.forName(sd.getInputFormat()).getConstructor().newInstance();
  job.setInputFormat(format.getClass());
  final Path path = new Path(sd.getLocation());
  final FileSystem fs = path.getFileSystem(job);

  if (fs.exists(path)) {
    FileInputFormat.addInputPath(job, path);
    format = job.getInputFormat();
    for (final InputSplit split : format.getSplits(job, 1)) {
      inputSplits.add(split);
      partitionMap.put(split, partition);
    }
  }
  final String numRowsProp = properties.getProperty("numRows");
  logger.trace("HiveScan num rows property = {}", numRowsProp);
  if (numRowsProp != null) {
    final long numRows = Long.valueOf(numRowsProp);
    // starting from hive-0.13, when no statistics are available, this property is set to -1
    // it's important to note that the value returned by hive may not be up to date
    if (numRows > 0) {
      rowCount += numRows;
    }
  }
}
Project: hadoop    File: TestMultipleInputs.java
public void testAddInputPathWithFormat() {
  final JobConf conf = new JobConf();
  MultipleInputs.addInputPath(conf, new Path("/foo"), TextInputFormat.class);
  MultipleInputs.addInputPath(conf, new Path("/bar"),
      KeyValueTextInputFormat.class);
  final Map<Path, InputFormat> inputs = MultipleInputs
     .getInputFormatMap(conf);
  assertEquals(TextInputFormat.class, inputs.get(new Path("/foo")).getClass());
  assertEquals(KeyValueTextInputFormat.class, inputs.get(new Path("/bar"))
     .getClass());
}