void testInputFormat(Class<? extends InputFormat> clazz) throws IOException {
  final JobConf job = MapreduceTestingShim.getJobConf(mrCluster);
  job.setInputFormat(clazz);
  job.setOutputFormat(NullOutputFormat.class);
  job.setMapperClass(ExampleVerifier.class);
  job.setNumReduceTasks(0);

  LOG.debug("submitting job.");
  final RunningJob run = JobClient.runJob(job);
  assertTrue("job failed!", run.isSuccessful());
  assertEquals("Saw the wrong number of instances of the filtered-for row.", 2, run.getCounters()
      .findCounter(TestTableInputFormat.class.getName() + ":row", "aaa").getCounter());
  assertEquals("Saw any instances of the filtered out row.", 0, run.getCounters()
      .findCounter(TestTableInputFormat.class.getName() + ":row", "bbb").getCounter());
  assertEquals("Saw the wrong number of instances of columnA.", 1, run.getCounters()
      .findCounter(TestTableInputFormat.class.getName() + ":family", "columnA").getCounter());
  assertEquals("Saw the wrong number of instances of columnB.", 1, run.getCounters()
      .findCounter(TestTableInputFormat.class.getName() + ":family", "columnB").getCounter());
  assertEquals("Saw the wrong count of values for the filtered-for row.", 2, run.getCounters()
      .findCounter(TestTableInputFormat.class.getName() + ":value", "value aaa").getCounter());
  assertEquals("Saw the wrong count of values for the filtered-out row.", 0, run.getCounters()
      .findCounter(TestTableInputFormat.class.getName() + ":value", "value bbb").getCounter());
}
public void testAddInputPathWithMapper() {
  final JobConf conf = new JobConf();
  MultipleInputs.addInputPath(conf, new Path("/foo"), TextInputFormat.class,
      MapClass.class);
  MultipleInputs.addInputPath(conf, new Path("/bar"),
      KeyValueTextInputFormat.class, MapClass2.class);
  final Map<Path, InputFormat> inputs = MultipleInputs
      .getInputFormatMap(conf);
  final Map<Path, Class<? extends Mapper>> maps = MultipleInputs
      .getMapperTypeMap(conf);

  assertEquals(TextInputFormat.class, inputs.get(new Path("/foo")).getClass());
  assertEquals(KeyValueTextInputFormat.class, inputs.get(new Path("/bar"))
      .getClass());
  assertEquals(MapClass.class, maps.get(new Path("/foo")));
  assertEquals(MapClass2.class, maps.get(new Path("/bar")));
}
/**
 * From each split sampled, take the first numSamples / numSplits records.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException {
  InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
  ArrayList<K> samples = new ArrayList<K>(numSamples);
  int splitsToSample = Math.min(maxSplitsSampled, splits.length);
  int splitStep = splits.length / splitsToSample;
  int samplesPerSplit = numSamples / splitsToSample;
  long records = 0;
  for (int i = 0; i < splitsToSample; ++i) {
    RecordReader<K,V> reader = inf.getRecordReader(splits[i * splitStep],
        job, Reporter.NULL);
    K key = reader.createKey();
    V value = reader.createValue();
    while (reader.next(key, value)) {
      samples.add(key);
      key = reader.createKey();
      ++records;
      if ((i+1) * samplesPerSplit <= records) {
        break;
      }
    }
    reader.close();
  }
  return (K[])samples.toArray();
}
/**
 * For each split sampled, emit when the ratio of the number of records
 * retained to the total record count is less than the specified
 * frequency.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException {
  InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
  ArrayList<K> samples = new ArrayList<K>();
  int splitsToSample = Math.min(maxSplitsSampled, splits.length);
  int splitStep = splits.length / splitsToSample;
  long records = 0;
  long kept = 0;
  for (int i = 0; i < splitsToSample; ++i) {
    RecordReader<K,V> reader = inf.getRecordReader(splits[i * splitStep],
        job, Reporter.NULL);
    K key = reader.createKey();
    V value = reader.createValue();
    while (reader.next(key, value)) {
      ++records;
      if ((double) kept / records < freq) {
        ++kept;
        samples.add(key);
        key = reader.createKey();
      }
    }
    reader.close();
  }
  return (K[])samples.toArray();
}
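The two getSample variants above follow the shape of the samplers nested in Hadoop's old-API InputSampler (a split sampler and a frequency-based sampler). As a rough, hedged illustration of how such a sampler is typically consumed, the fragment below wires a SplitSampler into total-order partitioning; the sample count, split cap, partition-file path, and key/value types are placeholder assumptions, not taken from the snippets above.

// Sketch only (assumed driver fragment, not from the original source):
// use a sampler to write split points for TotalOrderPartitioner via the old mapred API.
JobConf job = new JobConf(conf);
job.setInputFormat(SequenceFileInputFormat.class);           // assumes Text keys in a SequenceFile
Path partitionFile = new Path("/tmp/_partitions");            // hypothetical location
TotalOrderPartitioner.setPartitionFile(job, partitionFile);
InputSampler.Sampler<Text, Text> sampler =
    new InputSampler.SplitSampler<Text, Text>(1000, 10);      // ~1000 samples from at most 10 splits
InputSampler.writePartitionFile(job, sampler);                // samples keys and writes the partition file
job.setPartitionerClass(TotalOrderPartitioner.class);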
/**
 * @see org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil#addDependencyJars(org.apache.hadoop.mapreduce.Job)
 */
public static void addDependencyJars(JobConf job) throws IOException {
  org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil.addHBaseDependencyJars(job);
  org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil.addDependencyJars(
      job,
      // when making changes here, consider also mapreduce.TableMapReduceUtil
      // pull job classes
      job.getMapOutputKeyClass(),
      job.getMapOutputValueClass(),
      job.getOutputKeyClass(),
      job.getOutputValueClass(),
      job.getPartitionerClass(),
      job.getClass("mapred.input.format.class", TextInputFormat.class, InputFormat.class),
      job.getClass("mapred.output.format.class", TextOutputFormat.class, OutputFormat.class),
      job.getCombinerClass());
}
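A hedged sketch of where a helper like addDependencyJars(JobConf) is usually invoked: after the job's input format, mapper, and output classes are configured, and before submission, so those classes end up on the classpath of the launched tasks. The driver and mapper class names below are hypothetical placeholders.

// Sketch only: hypothetical mapred driver fragment calling the helper above
// so HBase jars and the job's own classes are shipped via the distributed cache.
JobConf job = new JobConf(conf, MyHBaseDriver.class);   // MyHBaseDriver is a placeholder
job.setInputFormat(TableInputFormat.class);             // assumed HBase mapred input format
job.setMapperClass(MyTableMapper.class);                // placeholder mapper
job.setNumReduceTasks(0);
addDependencyJars(job);                                 // must run after the classes above are set
JobClient.runJob(job);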
public static InputFormat<WritableComparable, Writable> getInputFormatFromCache(
    Class inputFormatClass, JobConf job) throws IOException {
  InputFormat<WritableComparable, Writable> instance = inputFormats.get(inputFormatClass);
  if (instance == null) {
    try {
      instance = (InputFormat<WritableComparable, Writable>) ReflectionUtil
          .newInstance(inputFormatClass, job);
      // HBase input formats are not thread safe today. See HIVE-8808.
      String inputFormatName = inputFormatClass.getName().toLowerCase();
      if (!inputFormatName.contains("hbase")) {
        inputFormats.put(inputFormatClass, instance);
      }
    } catch (Exception e) {
      throw new IOException("Cannot create an instance of InputFormat class "
          + inputFormatClass.getName() + " as specified in mapredWork!", e);
    }
  }
  return instance;
}
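getInputFormatFromCache reads from an inputFormats map declared elsewhere in its class; a plausible declaration (an assumption, not copied from the source) is a concurrent map keyed by the InputFormat class, which is what lets non-HBase instances be reused safely across calls.

// Assumed companion declaration for the cache used above (a sketch; the real field
// in Hive may differ in generics or visibility). HBase formats are deliberately never
// cached because their instances are not thread safe (HIVE-8808).
private static final Map<Class<?>, InputFormat<WritableComparable, Writable>> inputFormats =
    new ConcurrentHashMap<>();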
private org.apache.hadoop.mapred.RecordReader<BytesWritable, BytesWritable> getRecordReader(
    final InputFormat<BytesWritable, BytesWritable> inputFormat,
    final JobConf jobConf) throws ExecutionSetupException {
  try {
    final UserGroupInformation ugi = ImpersonationUtil.createProxyUgi(this.opUserName, this.queryUserName);
    return ugi.doAs(new PrivilegedExceptionAction<org.apache.hadoop.mapred.RecordReader<BytesWritable, BytesWritable>>() {
      @Override
      public org.apache.hadoop.mapred.RecordReader<BytesWritable, BytesWritable> run() throws Exception {
        return inputFormat.getRecordReader(split, jobConf, Reporter.NULL);
      }
    });
  } catch (IOException | InterruptedException e) {
    throw new ExecutionSetupException(
        String.format("Error in creating sequencefile reader for file: %s, start: %d, length: %d",
            split.getPath(), split.getStart(), split.getLength()), e);
  }
}
/**
 * Get the input format from the given {@link StorageDescriptor}.
 * @param properties
 * @param hiveReadEntry
 * @param sd
 * @return {@link InputFormat} class or null if a failure has occurred. Failure is logged as warning.
 */
private Class<? extends InputFormat<?, ?>> getInputFormatFromSD(final Properties properties,
    final HiveReadEntry hiveReadEntry, final StorageDescriptor sd, final HiveConf hiveConf) {
  final Table hiveTable = hiveReadEntry.getTable();
  try {
    final String inputFormatName = sd.getInputFormat();
    if (!Strings.isNullOrEmpty(inputFormatName)) {
      return (Class<? extends InputFormat<?, ?>>) Class.forName(inputFormatName);
    }

    final JobConf job = new JobConf(hiveConf);
    HiveUtilities.addConfToJob(job, properties);
    return HiveUtilities.getInputFormatClass(job, sd, hiveTable);
  } catch (final Exception e) {
    logger.warn("Failed to get InputFormat class from Hive table '{}.{}'. StorageDescriptor [{}]",
        hiveTable.getDbName(), hiveTable.getTableName(), sd.toString(), e);
    return null;
  }
}
/**
 * Utility method which gets the table or partition {@link InputFormat} class. First it
 * tries to get the class name from the given StorageDescriptor object. If that doesn't contain it,
 * it tries to get it from the StorageHandler class set in the table properties. If not found, throws an exception.
 * @param job {@link JobConf} instance needed in case the table is a StorageHandler based table.
 * @param sd {@link StorageDescriptor} instance of currently reading partition or table (for non-partitioned tables).
 * @param table Table object
 * @throws Exception
 */
public static Class<? extends InputFormat<?, ?>> getInputFormatClass(final JobConf job,
    final StorageDescriptor sd, final Table table) throws Exception {
  final String inputFormatName = sd.getInputFormat();
  if (Strings.isNullOrEmpty(inputFormatName)) {
    final String storageHandlerClass = table.getParameters().get(META_TABLE_STORAGE);
    if (Strings.isNullOrEmpty(storageHandlerClass)) {
      throw new ExecutionSetupException("Unable to get Hive table InputFormat class. There is neither " +
          "InputFormat class explicitly specified nor StorageHandler class");
    }
    final HiveStorageHandler storageHandler = HiveUtils.getStorageHandler(job, storageHandlerClass);
    return (Class<? extends InputFormat<?, ?>>) storageHandler.getInputFormatClass();
  } else {
    return (Class<? extends InputFormat<?, ?>>) Class.forName(inputFormatName);
  }
}
private void runImportRCFile(ExaIterator ctx, List<HCatTableColumn> columns,
    List<HCatTableColumn> partitionColumns, List<OutputColumnSpec> outputColumns,
    String file) throws Exception {
  List<HCatSerDeParameter> serDeParameters = new ArrayList<>();
  serDeParameters.add(new HCatSerDeParameter("serialization.format", "1"));

  String inputFormatClassName = "org.apache.hadoop.hive.ql.io.RCFileInputFormat";
  String serDeClassName = "org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe";
  String hdfsUser = "hdfs";
  boolean useKerberos = false;

  List<String> hdfsServers = new ArrayList<>();
  hdfsServers.add("file:///");
  final Configuration conf = new Configuration();
  FileSystem fs = HdfsService.getFileSystem(hdfsServers, conf);

  InputFormat<?, ?> inputFormat = (InputFormat<?, ?>) UdfUtils.getInstanceByName(inputFormatClassName);
  SerDe serDe = (SerDe) UdfUtils.getInstanceByName(serDeClassName);

  HdfsSerDeImportService.importFile(fs, file, partitionColumns, inputFormat, serDe, serDeParameters,
      hdfsServers, hdfsUser, columns, outputColumns, useKerberos, ctx);
}
public HiveFileIterator(
    Path path,
    FileSystem fileSystem,
    DirectoryLister directoryLister,
    NamenodeStats namenodeStats,
    String partitionName,
    InputFormat<?, ?> inputFormat,
    Properties schema,
    List<HivePartitionKey> partitionKeys,
    TupleDomain<HiveColumnHandle> effectivePredicate) {
  this.partitionName = requireNonNull(partitionName, "partitionName is null");
  this.inputFormat = requireNonNull(inputFormat, "inputFormat is null");
  this.schema = requireNonNull(schema, "schema is null");
  this.partitionKeys = requireNonNull(partitionKeys, "partitionKeys is null");
  this.effectivePredicate = requireNonNull(effectivePredicate, "effectivePredicate is null");
  this.path = requireNonNull(path, "path is null");
  this.fileSystem = requireNonNull(fileSystem, "fileSystem is null");
  this.directoryLister = requireNonNull(directoryLister, "directoryLister is null");
  this.namenodeStats = requireNonNull(namenodeStats, "namenodeStats is null");
}
@Test
public void testParquet() throws Exception {
  List<TestColumn> testColumns = getTestColumnsSupportedByParquet();

  HiveOutputFormat<?, ?> outputFormat = new MapredParquetOutputFormat();
  InputFormat<?, ?> inputFormat = new MapredParquetInputFormat();
  @SuppressWarnings("deprecation")
  SerDe serde = new ParquetHiveSerDe();
  File file = File.createTempFile("presto_test", "parquet");
  file.delete();
  try {
    FileSplit split = createTestFile(file.getAbsolutePath(), outputFormat, serde, null, testColumns, NUM_ROWS);
    HiveRecordCursorProvider cursorProvider = new ParquetRecordCursorProvider(false);
    testCursorProvider(cursorProvider, split, inputFormat, serde, testColumns, NUM_ROWS);
  } finally {
    //noinspection ResultOfMethodCallIgnored
    file.delete();
  }
}
private static JobConf createJobConf(Configuration conf, boolean useFastCopy) {
  Class<? extends InputFormat> inputFormat =
      (useFastCopy) ? FastCopyInputFormat.class : CopyInputFormat.class;
  JobConf jobconf = new JobConf(conf, DistCp.class);
  jobconf.setJobName(NAME);

  // turn off speculative execution, because DFS doesn't handle
  // multiple writers to the same file.
  jobconf.setReduceSpeculativeExecution(false);
  jobconf.setMapOutputKeyClass(FilePairComparable.class);
  jobconf.setMapOutputValueClass(Text.class);
  jobconf.setOutputKeyClass(FilePairComparable.class);
  jobconf.setOutputValueClass(Text.class);

  jobconf.setInputFormat(inputFormat);
  jobconf.setMapperClass(CopyFilesTask.class);
  jobconf.setReducerClass(CopyFilesTask.class);

  // Prevent the reducer from starting until all maps are done.
  jobconf.setInt("mapred.job.rushreduce.reduce.threshold", 0);
  jobconf.setFloat("mapred.reduce.slowstart.completed.maps", 1.0f);
  return jobconf;
}
@Test
public void testRCBinary() throws Exception {
  List<TestColumn> testColumns = ImmutableList.copyOf(filter(TEST_COLUMNS, testColumn -> {
    // RC file does not support complex type as key of a map
    return !testColumn.getName().equals("t_map_null_key_complex_key_value");
  }));

  HiveOutputFormat<?, ?> outputFormat = new RCFileOutputFormat();
  InputFormat<?, ?> inputFormat = new RCFileInputFormat<>();
  @SuppressWarnings("deprecation")
  SerDe serde = new LazyBinaryColumnarSerDe();
  File file = File.createTempFile("presto_test", "rc-binary");
  try {
    FileSplit split = createTestFile(file.getAbsolutePath(), outputFormat, serde, null, testColumns, NUM_ROWS);
    testCursorProvider(new ColumnarBinaryHiveRecordCursorProvider(), split, inputFormat, serde, testColumns, NUM_ROWS);
    testCursorProvider(new GenericHiveRecordCursorProvider(), split, inputFormat, serde, testColumns, NUM_ROWS);
  } finally {
    //noinspection ResultOfMethodCallIgnored
    file.delete();
  }
}
@Test
public void testParquetUseColumnNames() throws Exception {
  List<TestColumn> testColumns = getTestColumnsSupportedByParquet();

  HiveOutputFormat<?, ?> outputFormat = new MapredParquetOutputFormat();
  InputFormat<?, ?> inputFormat = new MapredParquetInputFormat();
  @SuppressWarnings("deprecation")
  SerDe serde = new ParquetHiveSerDe();
  File file = File.createTempFile("presto_test", "parquet");
  file.delete();
  try {
    FileSplit split = createTestFile(file.getAbsolutePath(), outputFormat, serde, null, testColumns, NUM_ROWS);
    // Reverse the order of the columns to test access by name, not by index
    Collections.reverse(testColumns);
    HiveRecordCursorProvider cursorProvider = new ParquetRecordCursorProvider(true);
    testCursorProvider(cursorProvider, split, inputFormat, serde, testColumns, NUM_ROWS);
  } finally {
    //noinspection ResultOfMethodCallIgnored
    file.delete();
  }
}
@Test
public void testOrcDataStream() throws Exception {
  HiveOutputFormat<?, ?> outputFormat = new org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat();
  InputFormat<?, ?> inputFormat = new org.apache.hadoop.hive.ql.io.orc.OrcInputFormat();
  @SuppressWarnings("deprecation")
  SerDe serde = new org.apache.hadoop.hive.ql.io.orc.OrcSerde();
  File file = File.createTempFile("presto_test", "orc");
  file.delete();
  try {
    FileSplit split = createTestFile(file.getAbsolutePath(), outputFormat, serde, null, TEST_COLUMNS, NUM_ROWS);
    testPageSourceFactory(new OrcPageSourceFactory(TYPE_MANAGER), split, inputFormat, serde, TEST_COLUMNS);
  } finally {
    //noinspection ResultOfMethodCallIgnored
    file.delete();
  }
}
@Override
public void init(@Nonnull Context context) {
  logger = context.jetInstance().getHazelcastInstance().getLoggingService().getLogger(ReadHdfsP.class);
  try {
    int totalParallelism = context.totalParallelism();
    InputFormat inputFormat = jobConf.getInputFormat();
    InputSplit[] splits = inputFormat.getSplits(jobConf, totalParallelism);
    IndexedInputSplit[] indexedInputSplits = new IndexedInputSplit[splits.length];
    Arrays.setAll(indexedInputSplits, i -> new IndexedInputSplit(i, splits[i]));

    Address[] addrs = context.jetInstance().getCluster().getMembers()
        .stream().map(Member::getAddress).toArray(Address[]::new);
    assigned = assignSplitsToMembers(indexedInputSplits, addrs);
    printAssignments(assigned);
  } catch (IOException e) {
    throw rethrow(e);
  }
}
@Override
@Nonnull
public List<Processor> get(int count) {
  Map<Integer, List<IndexedInputSplit>> processorToSplits =
      range(0, assignedSplits.size())
          .mapToObj(i -> new SimpleImmutableEntry<>(i, assignedSplits.get(i)))
          .collect(groupingBy(e -> e.getKey() % count,
              mapping(Entry::getValue, toList())));
  range(0, count)
      .forEach(processor -> processorToSplits.computeIfAbsent(processor, x -> emptyList()));
  InputFormat inputFormat = jobConf.getInputFormat();

  return processorToSplits
      .values().stream()
      .map(splits -> splits.isEmpty()
          ? Processors.noopP().get()
          : new ReadHdfsP<>(splits.stream()
              .map(IndexedInputSplit::getSplit)
              .map(split -> uncheckCall(() -> inputFormat.getRecordReader(split, jobConf, NULL)))
              .collect(toList()), mapper)
      ).collect(toList());
}
static InputFormat<?, ?> getInputFormat(Configuration configuration, Properties schema, boolean symlinkTarget) {
  String inputFormatName = getInputFormatName(schema);
  try {
    JobConf jobConf = new JobConf(configuration);

    Class<? extends InputFormat<?, ?>> inputFormatClass = getInputFormatClass(jobConf, inputFormatName);
    if (symlinkTarget && (inputFormatClass == SymlinkTextInputFormat.class)) {
      // symlink targets are always TextInputFormat
      inputFormatClass = TextInputFormat.class;
    }

    return ReflectionUtils.newInstance(inputFormatClass, jobConf);
  } catch (ClassNotFoundException | RuntimeException e) {
    throw new RuntimeException("Unable to create input format " + inputFormatName, e);
  }
}
/**
 * Rule is matched when all of the following match:
 * 1) GroupScan in the given DrillScanRel is a {@link HiveScan}
 * 2) {@link HiveScan} is not already rewritten using Drill's native readers
 * 3) InputFormat in Hive table metadata and all partitions metadata contains the same value
 *    {@link MapredParquetInputFormat}
 * 4) No error occurred while checking for the above conditions. An error is logged as warning.
 *
 * @param call
 * @return True if the rule can be applied. False otherwise
 */
@Override
public boolean matches(RelOptRuleCall call) {
  final DrillScanRel scanRel = (DrillScanRel) call.rel(0);
  final PlannerSettings settings = PrelUtil.getPlannerSettings(call.getPlanner());

  if (!(scanRel.getGroupScan() instanceof HiveScan) || ((HiveScan) scanRel.getGroupScan()).isNativeReader()) {
    return false;
  }

  final HiveScan hiveScan = (HiveScan) scanRel.getGroupScan();
  final Table hiveTable = hiveScan.hiveReadEntry.getTable();

  final Class<? extends InputFormat> tableInputFormat = getInputFormatFromSD(hiveTable, hiveTable.getSd());
  if (tableInputFormat == null || !tableInputFormat.equals(MapredParquetInputFormat.class)) {
    return false;
  }

  final List<HivePartition> partitions = hiveScan.hiveReadEntry.getHivePartitionWrappers();
  if (partitions == null) {
    return true;
  }

  // Make sure all partitions have the same input format as the table input format
  for (HivePartition partition : partitions) {
    Class<? extends InputFormat> inputFormat = getInputFormatFromSD(hiveTable, partition.getPartition().getSd());
    if (inputFormat == null || !inputFormat.equals(tableInputFormat)) {
      return false;
    }
  }

  return true;
}
/**
 * Get the input format from the given {@link StorageDescriptor}.
 * @param hiveTable
 * @param sd
 * @return {@link InputFormat} class or null if a failure has occurred. Failure is logged as warning.
 */
private Class<? extends InputFormat> getInputFormatFromSD(final Table hiveTable, final StorageDescriptor sd) {
  try {
    return (Class<? extends InputFormat>) Class.forName(sd.getInputFormat());
  } catch (ReflectiveOperationException e) {
    logger.warn("Failed to get InputFormat class from Hive table '{}.{}'. StorageDescriptor [{}]",
        hiveTable.getDbName(), hiveTable.getTableName(), sd.toString(), e);
    return null;
  }
}
private void splitInput(final Properties properties, final StorageDescriptor sd, final Partition partition)
    throws ReflectiveOperationException, IOException {
  final JobConf job = new JobConf();
  for (final Object obj : properties.keySet()) {
    job.set((String) obj, (String) properties.get(obj));
  }
  for (final Map.Entry<String, String> entry : hiveReadEntry.hiveConfigOverride.entrySet()) {
    job.set(entry.getKey(), entry.getValue());
  }

  InputFormat<?, ?> format = (InputFormat<?, ?>)
      Class.forName(sd.getInputFormat()).getConstructor().newInstance();
  job.setInputFormat(format.getClass());

  final Path path = new Path(sd.getLocation());
  final FileSystem fs = path.getFileSystem(job);

  if (fs.exists(path)) {
    FileInputFormat.addInputPath(job, path);
    format = job.getInputFormat();
    for (final InputSplit split : format.getSplits(job, 1)) {
      inputSplits.add(split);
      partitionMap.put(split, partition);
    }
  }

  final String numRowsProp = properties.getProperty("numRows");
  logger.trace("HiveScan num rows property = {}", numRowsProp);
  if (numRowsProp != null) {
    final long numRows = Long.valueOf(numRowsProp);
    // starting from hive-0.13, when no statistics are available, this property is set to -1
    // it's important to note that the value returned by hive may not be up to date
    if (numRows > 0) {
      rowCount += numRows;
    }
  }
}
public void testAddInputPathWithFormat() {
  final JobConf conf = new JobConf();
  MultipleInputs.addInputPath(conf, new Path("/foo"), TextInputFormat.class);
  MultipleInputs.addInputPath(conf, new Path("/bar"), KeyValueTextInputFormat.class);
  final Map<Path, InputFormat> inputs = MultipleInputs
      .getInputFormatMap(conf);

  assertEquals(TextInputFormat.class, inputs.get(new Path("/foo")).getClass());
  assertEquals(KeyValueTextInputFormat.class, inputs.get(new Path("/bar"))
      .getClass());
}