public HiveVectorizedReaderSetting( final FileSplit split , final JobConf job , final HiveReaderSetting hiveReaderConfig ) throws IOException {
  this.hiveReaderConfig = hiveReaderConfig;

  rbCtx = Utilities.getVectorizedRowBatchCtx( job );
  partitionValues = new Object[rbCtx.getPartitionColumnCount()];
  if( 0 < partitionValues.length ){
    rbCtx.getPartitionValues( rbCtx, job, split, partitionValues );
  }

  TypeInfo[] typeInfos = rbCtx.getRowColumnTypeInfos();
  columnNames = rbCtx.getRowColumnNames();
  needColumnIds = createNeedColumnId( ColumnProjectionUtils.getReadColumnIDs( job ) );

  projectionColumn = new boolean[columnNames.length];
  assignors = new IColumnVectorAssignor[columnNames.length];
  for( int id : needColumnIds ){
    projectionColumn[id] = true;
    assignors[id] = ColumnVectorAssignorFactory.create( typeInfos[id] );
  }
}
@Override
public RecordReader<NullWritable,ColumnAndIndex> getRecordReader( final InputSplit split, final JobConf job, final Reporter reporter ) throws IOException {
  FileSplit fileSplit = (FileSplit)split;
  Path path = fileSplit.getPath();
  FileSystem fs = path.getFileSystem( job );
  long fileLength = fs.getLength( path );
  long start = fileSplit.getStart();
  long length = fileSplit.getLength();
  InputStream in = fs.open( path );

  IJobReporter jobReporter = new HadoopJobReporter( reporter );
  jobReporter.setStatus( String.format( "Read file : %s" , path.toString() ) );

  HiveReaderSetting hiveConfig = new HiveReaderSetting( fileSplit , job );
  if ( hiveConfig.isVectorMode() ){
    IVectorizedReaderSetting vectorizedSetting = new HiveVectorizedReaderSetting( fileSplit , job , hiveConfig );
    return (RecordReader)new MDSHiveDirectVectorizedReader( in , fileLength , start , length , vectorizedSetting , jobReporter );
  }
  else{
    return new MDSHiveLineReader( in , fileLength , start , length , hiveConfig , jobReporter , spreadCounter );
  }
}
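// A minimal, hypothetical driver for any org.apache.hadoop.mapred.RecordReader<K, V> returned by a
// getRecordReader() like the one above. createKey/createValue/next/close are the standard mapred
// contract; this helper is an illustration only and is not part of the original class.
static <K, V> long countRecords( org.apache.hadoop.mapred.RecordReader<K, V> reader ) throws java.io.IOException {
  long count = 0;
  try {
    K key = reader.createKey();
    V value = reader.createValue();
    // next() fills key/value and returns false once the split is exhausted.
    while ( reader.next( key, value ) ) {
      count++;
    }
  } finally {
    reader.close();
  }
  return count;
}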
/**
 * Get the list of row group numbers for the given file input split. The logic used here is the same as
 * the logic Hive's Parquet input format uses to find the row group numbers for an input split.
 */
private List<Integer> getRowGroupNumbersFromFileSplit(final FileSplit split,
    final ParquetMetadata footer) throws IOException {
  final List<BlockMetaData> blocks = footer.getBlocks();

  final long splitStart = split.getStart();
  final long splitLength = split.getLength();

  final List<Integer> rowGroupNums = Lists.newArrayList();

  int i = 0;
  for (final BlockMetaData block : blocks) {
    final long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
    // A row group belongs to this split if its first data page starts within the split boundaries.
    if (firstDataPage >= splitStart && firstDataPage < splitStart + splitLength) {
      rowGroupNums.add(i);
    }
    i++;
  }

  return rowGroupNums;
}
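// Hypothetical follow-up (not from the original class): the indices returned by
// getRowGroupNumbersFromFileSplit can be mapped back to the footer's BlockMetaData entries, e.g. to
// count the rows assigned to this split. Assumes the same ParquetMetadata/BlockMetaData types used above.
private static long countRowsInSplit(final ParquetMetadata footer, final List<Integer> rowGroupNums) {
  long rows = 0;
  for (final int idx : rowGroupNums) {
    final BlockMetaData block = footer.getBlocks().get(idx);
    rows += block.getRowCount();  // row count of a single row group
  }
  return rows;
}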
public DelimitedAndFixedWidthRecordReader(JobConf conf, FileSplit split) throws IOException {
  lengthsAndDelimiters = DelimitedAndFixedWidthHelper.modifyIdentifier(
      conf.get("lengthsAndDelimiters").split(Constants.LENGTHS_AND_DELIMITERS_SEPARATOR));
  lengthsAndDelimitersType = conf.get("lengthsAndDelimitersType")
      .split(Constants.LENGTHS_AND_DELIMITERS_SEPARATOR);
  quote = conf.get("quote");
  charsetName = conf.get("charsetName");
  start = split.getStart();
  pos = start;
  end = start + split.getLength();
  file = split.getPath();
  fs = file.getFileSystem(conf);
  fileIn = fs.open(split.getPath());
  fileIn.seek(start);
  inputStreamReader = new InputStreamReader(fileIn, charsetName);
  singleChar = new char[1];
  stringBuilder = new StringBuilder();
  isQuotePresent = isQuotePresent(quote);
}
public StreamXmlRecordReader(FSDataInputStream in, FileSplit split, Reporter reporter,
    JobConf job, FileSystem fs) throws IOException {
  super(in, split, reporter, job, fs);

  beginMark_ = checkJobGet(CONF_NS + "begin");
  endMark_ = checkJobGet(CONF_NS + "end");

  maxRecSize_ = job_.getInt(CONF_NS + "maxrec", 50 * 1000);
  lookAhead_ = job_.getInt(CONF_NS + "lookahead", 2 * maxRecSize_);
  synched_ = false;

  slowMatch_ = job_.getBoolean(CONF_NS + "slowmatch", false);
  if (slowMatch_) {
    beginPat_ = makePatternCDataOrMark(beginMark_);
    endPat_ = makePatternCDataOrMark(endMark_);
  }
  init();
}
public RecordReader getRecordReader(InputSplit split, JobConf job, Reporter reporter) throws IOException {
  FileSplit fileSplit = (FileSplit) split;
  FileSystem fs = FileSystem.get(fileSplit.getPath().toUri(), job);
  FSDataInputStream is = fs.open(fileSplit.getPath());
  byte[] header = new byte[3];
  try {
    is.readFully(header);
  } catch (EOFException eof) {
    // The file is shorter than the 3-byte SequenceFile magic, so it cannot be a SequenceFile.
    return textInputFormat.getRecordReader(split, job, reporter);
  } finally {
    is.close();
  }
  // Files starting with the "SEQ" magic bytes are SequenceFiles; everything else is treated as text.
  if (header[0] == 'S' && header[1] == 'E' && header[2] == 'Q') {
    return seqFileInputFormat.getRecordReader(split, job, reporter);
  }
  return textInputFormat.getRecordReader(split, job, reporter);
}
public FileSplitParquetRecordReader(
    final OperatorContext oContext,
    final ParquetReaderFactory readerFactory,
    final List<SchemaPath> columnsToRead,
    final List<SchemaPath> groupScanColumns,
    final List<FilterCondition> conditions,
    final FileSplit fileSplit,
    final ParquetMetadata footer,
    final JobConf jobConf,
    final boolean vectorize,
    final boolean enableDetailedTracing
) {
  this.oContext = oContext;
  this.columnsToRead = columnsToRead;
  this.groupScanColumns = groupScanColumns;
  this.conditions = conditions;
  this.fileSplit = fileSplit;
  this.footer = footer;
  this.jobConf = jobConf;
  this.readerFactory = readerFactory;
  this.vectorize = vectorize;
  this.enableDetailedTracing = enableDetailedTracing;
}
/**
 * Get the list of row group numbers for the given file input split. The logic used here is the same as
 * the logic Hive's Parquet input format uses to find the row group numbers for an input split.
 */
private static List<Integer> getRowGroupNumbersFromFileSplit(final FileSplit split,
    final ParquetMetadata footer) throws IOException {
  final List<BlockMetaData> blocks = footer.getBlocks();

  final long splitStart = split.getStart();
  final long splitLength = split.getLength();

  final List<Integer> rowGroupNums = Lists.newArrayList();

  int i = 0;
  for (final BlockMetaData block : blocks) {
    final long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
    if (firstDataPage >= splitStart && firstDataPage < splitStart + splitLength) {
      rowGroupNums.add(i);
    }
    i++;
  }

  return rowGroupNums;
}
static RecordReader<NullWritable, DynamoDBItemWritable> getRecordReader(
    InputSplit inputSplit, JobConf job, Reporter reporter) throws IOException {
  // CombineFileSplit indicates the new export format which includes a manifest file
  if (inputSplit instanceof CombineFileSplit) {
    int version = job.getInt(DynamoDBConstants.EXPORT_FORMAT_VERSION, -1);
    if (version != ExportManifestRecordWriter.FORMAT_VERSION) {
      throw new IOException("Unknown version: " + job.get(DynamoDBConstants.EXPORT_FORMAT_VERSION));
    }
    return new ImportCombineFileRecordReader((CombineFileSplit) inputSplit, job, reporter);
  } else if (inputSplit instanceof FileSplit) {
    // FileSplit indicates the old data pipeline format which doesn't include a manifest file
    Path path = ((FileSplit) inputSplit).getPath();
    return new ImportRecordReader(job, path);
  } else {
    throw new IOException("Expecting CombineFileSplit or FileSplit but the input split type is: "
        + inputSplit.getClass());
  }
}
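// Hypothetical setup for the CombineFileSplit branch above (illustration only): the job configuration
// must carry the export format version expected by ExportManifestRecordWriter, otherwise the version
// check in getRecordReader throws an IOException.
JobConf job = new JobConf();
job.setInt(DynamoDBConstants.EXPORT_FORMAT_VERSION, ExportManifestRecordWriter.FORMAT_VERSION);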
public IndexRRecordReader(InputSplit inputSplit, Configuration configuration) throws IOException {
  FileSplit fileSplit = (FileSplit) inputSplit;
  Preconditions.checkState(fileSplit.getStart() == 0, "Segment should not be split");
  Path filePath = fileSplit.getPath();

  // Hive may ask to read a file located on the local file system.
  // We have to get the real file system from the path's scheme.
  FileSystem fileSystem = FileSystem.get(filePath.toUri(), FileSystem.get(configuration).getConf());

  if (SegmentHelper.checkSegmentByPath(filePath)) {
    ByteBufferReader.Opener opener = ByteBufferReader.Opener.create(fileSystem, filePath);
    IntegratedSegment.Fd fd = IntegratedSegment.Fd.create(filePath.toString(), opener);
    if (fd != null) {
      segment = fd.open();
      offset = 0L;
      rowIterator = segment.rowTraversal().iterator();
      getIncludeColumns(configuration, segment);
    }
  } else {
    LOG.warn("ignore " + filePath);
  }
}
@Override
public FileSplit[] getSplits(JobConf job, int numSplits) throws IOException {
  // first, merge input table properties (since there's no access to them ...)
  Settings settings = HadoopSettingsManager.loadFrom(job);
  //settings.merge(IOUtils.propsFromString(settings.getProperty(HiveConstants.INPUT_TBL_PROPERTIES)));

  Log log = LogFactory.getLog(getClass());
  // move on to initialization
  InitializationUtils.setValueReaderIfNotSet(settings, HiveValueReader.class, log);
  settings.setProperty(InternalConfigurationOptions.INTERNAL_ES_TARGET_FIELDS,
      StringUtils.concatenateAndUriEncode(HiveUtils.columnToAlias(settings), ","));
  // set read resource
  settings.setResourceRead(settings.getResourceRead());
  HiveUtils.init(settings, log);

  // decorate original splits as FileSplit
  InputSplit[] shardSplits = super.getSplits(job, numSplits);
  FileSplit[] wrappers = new FileSplit[shardSplits.length];
  Path path = new Path(job.get(HiveConstants.TABLE_LOCATION));
  for (int i = 0; i < wrappers.length; i++) {
    wrappers[i] = new EsHiveSplit(shardSplits[i], path);
  }
  return wrappers;
}
@Test
public void testOrcDataStream() throws Exception {
  HiveOutputFormat<?, ?> outputFormat = new org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat();
  InputFormat<?, ?> inputFormat = new org.apache.hadoop.hive.ql.io.orc.OrcInputFormat();
  @SuppressWarnings("deprecation")
  SerDe serde = new org.apache.hadoop.hive.ql.io.orc.OrcSerde();
  File file = File.createTempFile("presto_test", "orc");
  file.delete();
  try {
    FileSplit split = createTestFile(file.getAbsolutePath(), outputFormat, serde, null, TEST_COLUMNS, NUM_ROWS);
    testPageSourceFactory(new OrcPageSourceFactory(TYPE_MANAGER), split, inputFormat, serde, TEST_COLUMNS);
  } finally {
    //noinspection ResultOfMethodCallIgnored
    file.delete();
  }
}
@Test(enabled = false)
public void testRcTextPageSource() throws Exception {
  HiveOutputFormat<?, ?> outputFormat = new RCFileOutputFormat();
  InputFormat<?, ?> inputFormat = new RCFileInputFormat<>();
  @SuppressWarnings("deprecation")
  SerDe serde = new ColumnarSerDe();
  File file = File.createTempFile("presto_test", "rc-binary");
  file.delete();
  try {
    FileSplit split = createTestFile(file.getAbsolutePath(), outputFormat, serde, null, TEST_COLUMNS, NUM_ROWS);
    testPageSourceFactory(new RcFilePageSourceFactory(TYPE_MANAGER), split, inputFormat, serde, TEST_COLUMNS);
  } finally {
    //noinspection ResultOfMethodCallIgnored
    file.delete();
  }
}
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
  InputSplit[] tmp = super.getSplits(job, numSplits);

  //get partitioning information
  MatrixCharacteristics mc = MRJobConfiguration.getPartitionedMatrixSize(job);
  PDataPartitionFormat dpf = MRJobConfiguration.getPartitioningFormat(job);
  PartitionFormat pf = new PartitionFormat(dpf, -1);
  int blen = (int) (pf.isRowwise() ? pf.getNumRows(mc) : pf.getNumColumns(mc));
  String fname = MRJobConfiguration.getPartitioningFilename(job);

  //create wrapper splits
  InputSplit[] ret = new InputSplit[ tmp.length ];
  for( int i=0; i<tmp.length; i++ ) {
    //check for robustness of subsequent cast
    if( tmp[i] instanceof FileSplit )
      ret[i] = new RemoteParForColocatedFileSplit( (FileSplit) tmp[i], fname, blen );
    else
      ret[i] = tmp[i];
  }

  return ret;
}
/**
 * Constructor
 * @param job
 * @param split
 * @throws IOException
 */
public LineDocRecordReader(Configuration job, FileSplit split) throws IOException {
  long start = split.getStart();
  long end = start + split.getLength();
  final Path file = split.getPath();

  // open the file and seek to the start of the split
  FileSystem fs = file.getFileSystem(job);
  FSDataInputStream fileIn = fs.open(split.getPath());
  InputStream in = fileIn;
  boolean skipFirstLine = false;
  if (start != 0) {
    skipFirstLine = true; // wait till BufferedInputStream to skip
    --start;
    fileIn.seek(start);
  }
  this.in = new BufferedInputStream(in);
  if (skipFirstLine) {
    // skip first line and re-establish "start".
    start += LineDocRecordReader.readData(this.in, null, EOL);
  }
  this.start = start;
  this.pos = start;
  this.end = end;
}
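// Illustrative sketch (not part of the original class): two adjacent FileSplits over the same file.
// A reader whose split starts at a non-zero offset skips its first, partial line, as the constructor
// above does, because the reader of the preceding split reads past its own end to finish that line.
static FileSplit[] splitInTwo(FileSystem fs, Path file) throws IOException {
  long len = fs.getFileStatus(file).getLen();
  long half = len / 2;
  return new FileSplit[] {
      new FileSplit(file, 0, half, new String[0]),
      new FileSplit(file, half, len - half, new String[0])
  };
}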
@SuppressWarnings("unchecked")
public RecordReader<K, V> getRecordReader(InputSplit split, JobConf conf, Reporter reporter)
    throws IOException {
  // Find the InputFormat and then the RecordReader from the TaggedInputSplit.
  TaggedInputSplit taggedInputSplit = (TaggedInputSplit) split;
  InputFormat<K, V> inputFormat = (InputFormat<K, V>) ReflectionUtils
      .newInstance(taggedInputSplit.getInputFormatClass(), conf);
  InputSplit inputSplit = taggedInputSplit.getInputSplit();
  if (inputSplit instanceof FileSplit) {
    FileSplit fileSplit = (FileSplit) inputSplit;
    conf.set(MRConfigurationNames.MR_MAP_INPUT_FILE, fileSplit.getPath().toString());
    conf.setLong(MRConfigurationNames.MR_MAP_INPUT_START, fileSplit.getStart());
    conf.setLong(MRConfigurationNames.MR_MAP_INPUT_LENGTH, fileSplit.getLength());
  }

  return inputFormat.getRecordReader(taggedInputSplit.getInputSplit(), conf, reporter);
}
public void map(Text key, Text value, OutputCollector<Text, Text> output, Reporter reporter)
    throws IOException {
  if (lastKey == null) {
    filename = getFilename((FileSplit) reporter.getInputSplit());
    output.collect(new Text(filename + ":begin"), key);
    lastKey = new Text();
    this.output = output;
  } else {
    if (key.compareTo(lastKey) < 0) {
      output.collect(error, new Text("misorder in " + filename + " last: '" + lastKey
          + "' current: '" + key + "'"));
    }
  }
  lastKey.set(key);
}
@Test
public void testParquetUseColumnNames() throws Exception {
  List<TestColumn> testColumns = getTestColumnsSupportedByParquet();

  HiveOutputFormat<?, ?> outputFormat = new MapredParquetOutputFormat();
  InputFormat<?, ?> inputFormat = new MapredParquetInputFormat();
  @SuppressWarnings("deprecation")
  SerDe serde = new ParquetHiveSerDe();
  File file = File.createTempFile("presto_test", "parquet");
  file.delete();
  try {
    FileSplit split = createTestFile(file.getAbsolutePath(), outputFormat, serde, null, testColumns, NUM_ROWS);
    // Reverse the order of the columns to test access by name, not by index
    Collections.reverse(testColumns);
    HiveRecordCursorProvider cursorProvider = new ParquetRecordCursorProvider(true);
    testCursorProvider(cursorProvider, split, inputFormat, serde, testColumns, NUM_ROWS);
  } finally {
    //noinspection ResultOfMethodCallIgnored
    file.delete();
  }
}
@Test
public void testParquet() throws Exception {
  List<TestColumn> testColumns = getTestColumnsSupportedByParquet();

  HiveOutputFormat<?, ?> outputFormat = new MapredParquetOutputFormat();
  InputFormat<?, ?> inputFormat = new MapredParquetInputFormat();
  @SuppressWarnings("deprecation")
  SerDe serde = new ParquetHiveSerDe();
  File file = File.createTempFile("presto_test", "parquet");
  file.delete();
  try {
    FileSplit split = createTestFile(file.getAbsolutePath(), outputFormat, serde, null, testColumns, NUM_ROWS);
    HiveRecordCursorProvider cursorProvider = new ParquetRecordCursorProvider(false);
    testCursorProvider(cursorProvider, split, inputFormat, serde, testColumns, NUM_ROWS);
  } finally {
    //noinspection ResultOfMethodCallIgnored
    file.delete();
  }
}
@Test
public void testDwrfDataStream() throws Exception {
  List<TestColumn> testColumns = ImmutableList.copyOf(filter(TEST_COLUMNS, testColumn -> {
    ObjectInspector objectInspector = testColumn.getObjectInspector();
    return !hasType(objectInspector, PrimitiveCategory.DATE);
  }));

  HiveOutputFormat<?, ?> outputFormat = new com.facebook.hive.orc.OrcOutputFormat();
  InputFormat<?, ?> inputFormat = new com.facebook.hive.orc.OrcInputFormat();
  @SuppressWarnings("deprecation")
  SerDe serde = new com.facebook.hive.orc.OrcSerde();
  File file = File.createTempFile("presto_test", "dwrf");
  file.delete();
  try {
    FileSplit split = createTestFile(file.getAbsolutePath(), outputFormat, serde, null, testColumns, NUM_ROWS);
    testPageSourceFactory(new DwrfPageSourceFactory(TYPE_MANAGER), split, inputFormat, serde, testColumns);
  } finally {
    //noinspection ResultOfMethodCallIgnored
    file.delete();
  }
}
public ParsedRecordReader ( FileSplit split,
                            Configuration conf,
                            Class<? extends Parser> parser_class,
                            Trees args ) throws IOException {
  start = split.getStart();
  end = start + split.getLength();
  Path file = split.getPath();
  FileSystem fs = file.getFileSystem(conf);
  fsin = fs.open(split.getPath());
  try {
    parser = parser_class.newInstance();
  } catch (Exception ex) {
    throw new Error("Unrecognized parser: " + parser_class);
  }
  parser.initialize(args);
  parser.open(fsin, start, end);
  result = null;
}
public XMLRecordReader(FileSplit split, JobConf jobConf) throws IOException {
  log.info("Setting up XMLRecordReader for path: [" + split.getPath() + "]");
  log.info("startTag=" + jobConf.get(START_TAG_KEY) + ", endTag=" + jobConf.get(END_TAG_KEY));

  startTag = jobConf.get(START_TAG_KEY).getBytes("utf-8");
  endTag = jobConf.get(END_TAG_KEY).getBytes("utf-8");

  // open the file and seek to the start of the split
  start = split.getStart();
  end = start + split.getLength();
  Path file = split.getPath();
  FileSystem fs = file.getFileSystem(jobConf);
  path = split.getPath().getName();
  fsin = fs.open(split.getPath());
  fsin.seek(start);
}
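// Hypothetical construction sequence (illustration only): the start and end tags read above must be set
// on the JobConf before the reader is built. The tag values and the FileSplit named split are
// placeholders, not values from the original code.
JobConf jobConf = new JobConf();
jobConf.set(START_TAG_KEY, "<record>");
jobConf.set(END_TAG_KEY, "</record>");
XMLRecordReader reader = new XMLRecordReader(split, jobConf);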
@Test
public void testRCBinary() throws Exception {
  List<TestColumn> testColumns = ImmutableList.copyOf(filter(TEST_COLUMNS, testColumn -> {
    // RC file does not support complex type as key of a map
    return !testColumn.getName().equals("t_map_null_key_complex_key_value");
  }));

  HiveOutputFormat<?, ?> outputFormat = new RCFileOutputFormat();
  InputFormat<?, ?> inputFormat = new RCFileInputFormat<>();
  @SuppressWarnings("deprecation")
  SerDe serde = new LazyBinaryColumnarSerDe();
  File file = File.createTempFile("presto_test", "rc-binary");
  try {
    FileSplit split = createTestFile(file.getAbsolutePath(), outputFormat, serde, null, testColumns, NUM_ROWS);
    testCursorProvider(new ColumnarBinaryHiveRecordCursorProvider(), split, inputFormat, serde, testColumns, NUM_ROWS);
    testCursorProvider(new GenericHiveRecordCursorProvider(), split, inputFormat, serde, testColumns, NUM_ROWS);
  } finally {
    //noinspection ResultOfMethodCallIgnored
    file.delete();
  }
}
@Test(enabled = false)
public void testRcBinaryPageSource() throws Exception {
  HiveOutputFormat<?, ?> outputFormat = new RCFileOutputFormat();
  InputFormat<?, ?> inputFormat = new RCFileInputFormat<>();
  @SuppressWarnings("deprecation")
  SerDe serde = new LazyBinaryColumnarSerDe();
  File file = File.createTempFile("presto_test", "rc-binary");
  file.delete();
  try {
    FileSplit split = createTestFile(file.getAbsolutePath(), outputFormat, serde, null, TEST_COLUMNS, NUM_ROWS);
    testPageSourceFactory(new RcFilePageSourceFactory(TYPE_MANAGER), split, inputFormat, serde, TEST_COLUMNS);
  } finally {
    //noinspection ResultOfMethodCallIgnored
    file.delete();
  }
}
public RangePickRecordReader(JobConf job, FileSplit split) throws IOException {
  parseSelectedRangeString(job.get(SELECTED_RANGES));

  // check if the current part file needs to be processed
  path = split.getPath();
  totLength = split.getLength();
  currentStream = IOUtilFunctions.getFileSystem(path, job).open(path);
  currPart = getIndexInTheArray(path.getName());

  if (currPart < beginPart || currPart > endPart) {
    noRecordsNeeded = true;
    return;
  }

  int part0 = job.getInt(PARTITION_OF_ZERO, -1);
  boolean contain0s = false;
  long numZeros = 0;
  if (part0 == currPart) {
    contain0s = true;
    numZeros = job.getLong(NUMBER_OF_ZERO, 0);
  }
  reader = new ReadWithZeros(currentStream, contain0s, numZeros);
}
/**
 * Helper function that iterates through a RecordReader and asserts the record count.
 */
public void testForReadAllRecordsNotStrict(String fileName, int expectedRecordCount) throws IOException {
  CsvInputFormat inputFormat = helper.createCSVInputFormat(conf);
  File inputFile = helper.getFile(fileName);
  Path inputPath = new Path(inputFile.getAbsoluteFile().toURI().toString());
  FileSplit split = helper.createFileSplit(inputPath, 0, inputFile.length());
  RecordReader createdReader = helper.createRecordReader(inputFormat, split, jobConf);

  LongWritable key = new LongWritable();
  ListWritable<Text> value = new ListWritable<Text>(Text.class);

  int actualRecordCount = 0;
  while (createdReader.next(key, value)) {
    actualRecordCount++;
  }
  assertEquals(expectedRecordCount, actualRecordCount);
}
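// Hypothetical usage of the helper above; the file name and expected count are placeholders, not values
// from the original test suite.
@Test
public void readsAllRecordsFromSampleCsv() throws IOException {
  testForReadAllRecordsNotStrict("sample.csv", 10);
}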
public DrillTextRecordReader(FileSplit split, Configuration fsConf, FragmentContext context,
    char delimiter, List<SchemaPath> columns) {
  this.delimiter = (byte) delimiter;
  this.split = split;
  setColumns(columns);

  if (!isStarQuery()) {
    String pathStr;
    for (SchemaPath path : columns) {
      assert path.getRootSegment().isNamed();
      pathStr = path.getRootSegment().getPath();
      Preconditions.checkArgument(pathStr.equals(COL_NAME)
          || (pathStr.equals("*") && path.getRootSegment().getChild() == null),
          "Selected column(s) must have name 'columns' or must be plain '*'");

      if (path.getRootSegment().getChild() != null) {
        Preconditions.checkArgument(path.getRootSegment().getChild().isArray(),
            "Selected column must be an array index");
        int index = path.getRootSegment().getChild().getArraySegment().getIndex();
        columnIds.add(index);
      }
    }
    Collections.sort(columnIds);
    numCols = columnIds.size();
  }

  TextInputFormat inputFormat = new TextInputFormat();
  JobConf job = new JobConf(fsConf);
  job.setInt("io.file.buffer.size", context.getConfig().getInt(ExecConstants.TEXT_LINE_READER_BUFFER_SIZE));
  job.setInputFormat(inputFormat.getClass());
  try {
    reader = inputFormat.getRecordReader(split, job, Reporter.NULL);
    key = reader.createKey();
    value = reader.createValue();
    totalRecordsRead = 0;
  } catch (Exception e) {
    handleAndRaise("Failure in creating record reader", e);
  }
}
@Override
public RecordReader getRecordReader(FragmentContext context, DrillFileSystem dfs, FileWork fileWork,
    List<SchemaPath> columns) throws ExecutionSetupException {
  Path path = dfs.makeQualified(new Path(fileWork.getPath()));
  FileSplit split = new FileSplit(path, fileWork.getStart(), fileWork.getLength(), new String[]{""});

  if (context.getOptions().getOption(ExecConstants.ENABLE_NEW_TEXT_READER_KEY).bool_val == true) {
    TextParsingSettings settings = new TextParsingSettings();
    settings.set((TextFormatConfig) formatConfig);
    return new CompliantTextRecordReader(split, dfs, context, settings, columns);
  } else {
    char delim = ((TextFormatConfig) formatConfig).getFieldDelimiter();
    return new DrillTextRecordReader(split, dfs.getConf(), context, delim, columns);
  }
}