public HiveVectorizedReaderSetting( final FileSplit split , final JobConf job , final HiveReaderSetting hiveReaderConfig ) throws IOException {
  this.hiveReaderConfig = hiveReaderConfig;

  rbCtx = Utilities.getVectorizedRowBatchCtx( job );
  partitionValues = new Object[rbCtx.getPartitionColumnCount()];
  if( 0 < partitionValues.length ){
    rbCtx.getPartitionValues( rbCtx, job, split, partitionValues );
  }

  TypeInfo[] typeInfos = rbCtx.getRowColumnTypeInfos();
  columnNames = rbCtx.getRowColumnNames();
  needColumnIds = createNeedColumnId( ColumnProjectionUtils.getReadColumnIDs( job ) );

  projectionColumn = new boolean[columnNames.length];
  assignors = new IColumnVectorAssignor[columnNames.length];
  for( int id : needColumnIds ){
    projectionColumn[id] = true;
    assignors[id] = ColumnVectorAssignorFactory.create( typeInfos[id] );
  }
}
@Override
public RecordReader<NullWritable,ColumnAndIndex> getRecordReader( final InputSplit split, final JobConf job, final Reporter reporter ) throws IOException {
  FileSplit fileSplit = (FileSplit)split;
  Path path = fileSplit.getPath();
  FileSystem fs = path.getFileSystem( job );
  long fileLength = fs.getLength( path );
  long start = fileSplit.getStart();
  long length = fileSplit.getLength();
  InputStream in = fs.open( path );

  IJobReporter jobReporter = new HadoopJobReporter( reporter );
  jobReporter.setStatus( String.format( "Read file : %s" , path.toString() ) );

  HiveReaderSetting hiveConfig = new HiveReaderSetting( fileSplit , job );
  if ( hiveConfig.isVectorMode() ){
    IVectorizedReaderSetting vectorizedSetting = new HiveVectorizedReaderSetting( fileSplit , job , hiveConfig );
    return (RecordReader)new MDSHiveDirectVectorizedReader( in , fileLength , start , length , vectorizedSetting , jobReporter );
  }
  else{
    return new MDSHiveLineReader( in , fileLength , start , length , hiveConfig , jobReporter , spreadCounter );
  }
}
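// A minimal, hypothetical driver for any org.apache.hadoop.mapred.RecordReader<K, V> returned by a
// getRecordReader() like the one above. createKey/createValue/next/close are the standard mapred
// contract; this helper is an illustration only and is not part of the original class.
static <K, V> long countRecords( org.apache.hadoop.mapred.RecordReader<K, V> reader ) throws java.io.IOException {
  long count = 0;
  try {
    K key = reader.createKey();
    V value = reader.createValue();
    // next() fills key/value and returns false once the split is exhausted.
    while ( reader.next( key, value ) ) {
      count++;
    }
  } finally {
    reader.close();
  }
  return count;
}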
/**
 * Get the list of row group numbers for the given file input split. The logic used here is the same as
 * the logic Hive's Parquet input format uses to find the row group numbers for an input split.
 */
private List<Integer> getRowGroupNumbersFromFileSplit(final FileSplit split,
    final ParquetMetadata footer) throws IOException {
  final List<BlockMetaData> blocks = footer.getBlocks();

  final long splitStart = split.getStart();
  final long splitLength = split.getLength();

  final List<Integer> rowGroupNums = Lists.newArrayList();

  int i = 0;
  for (final BlockMetaData block : blocks) {
    final long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
    // A row group belongs to this split if its first data page starts within the split boundaries.
    if (firstDataPage >= splitStart && firstDataPage < splitStart + splitLength) {
      rowGroupNums.add(i);
    }
    i++;
  }

  return rowGroupNums;
}
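// Hypothetical follow-up (not from the original class): the indices returned by
// getRowGroupNumbersFromFileSplit can be mapped back to the footer's BlockMetaData entries, e.g. to
// count the rows assigned to this split. Assumes the same ParquetMetadata/BlockMetaData types used above.
private static long countRowsInSplit(final ParquetMetadata footer, final List<Integer> rowGroupNums) {
  long rows = 0;
  for (final int idx : rowGroupNums) {
    final BlockMetaData block = footer.getBlocks().get(idx);
    rows += block.getRowCount();  // row count of a single row group
  }
  return rows;
}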
public DelimitedAndFixedWidthRecordReader(JobConf conf, FileSplit split) throws IOException {
  lengthsAndDelimiters = DelimitedAndFixedWidthHelper.modifyIdentifier(
      conf.get("lengthsAndDelimiters").split(Constants.LENGTHS_AND_DELIMITERS_SEPARATOR));
  lengthsAndDelimitersType = conf.get("lengthsAndDelimitersType")
      .split(Constants.LENGTHS_AND_DELIMITERS_SEPARATOR);
  quote = conf.get("quote");
  charsetName = conf.get("charsetName");
  start = split.getStart();
  pos = start;
  end = start + split.getLength();
  file = split.getPath();
  fs = file.getFileSystem(conf);
  fileIn = fs.open(split.getPath());
  fileIn.seek(start);
  inputStreamReader = new InputStreamReader(fileIn, charsetName);
  singleChar = new char[1];
  stringBuilder = new StringBuilder();
  isQuotePresent = isQuotePresent(quote);
}
public StreamXmlRecordReader(FSDataInputStream in, FileSplit split, Reporter reporter,
    JobConf job, FileSystem fs) throws IOException {
  super(in, split, reporter, job, fs);

  beginMark_ = checkJobGet(CONF_NS + "begin");
  endMark_ = checkJobGet(CONF_NS + "end");

  maxRecSize_ = job_.getInt(CONF_NS + "maxrec", 50 * 1000);
  lookAhead_ = job_.getInt(CONF_NS + "lookahead", 2 * maxRecSize_);
  synched_ = false;

  slowMatch_ = job_.getBoolean(CONF_NS + "slowmatch", false);
  if (slowMatch_) {
    beginPat_ = makePatternCDataOrMark(beginMark_);
    endPat_ = makePatternCDataOrMark(endMark_);
  }
  init();
}
public RecordReader getRecordReader(InputSplit split, JobConf job, Reporter reporter) throws IOException {
  FileSplit fileSplit = (FileSplit) split;
  FileSystem fs = FileSystem.get(fileSplit.getPath().toUri(), job);
  FSDataInputStream is = fs.open(fileSplit.getPath());
  byte[] header = new byte[3];
  try {
    is.readFully(header);
  } catch (EOFException eof) {
    // The file is shorter than the 3-byte SequenceFile magic, so it cannot be a SequenceFile.
    return textInputFormat.getRecordReader(split, job, reporter);
  } finally {
    is.close();
  }
  // Files starting with the "SEQ" magic bytes are SequenceFiles; everything else is treated as text.
  if (header[0] == 'S' && header[1] == 'E' && header[2] == 'Q') {
    return seqFileInputFormat.getRecordReader(split, job, reporter);
  }
  return textInputFormat.getRecordReader(split, job, reporter);
}
public FileSplitParquetRecordReader(
    final OperatorContext oContext,
    final ParquetReaderFactory readerFactory,
    final List<SchemaPath> columnsToRead,
    final List<SchemaPath> groupScanColumns,
    final List<FilterCondition> conditions,
    final FileSplit fileSplit,
    final ParquetMetadata footer,
    final JobConf jobConf,
    final boolean vectorize,
    final boolean enableDetailedTracing
) {
  this.oContext = oContext;
  this.columnsToRead = columnsToRead;
  this.groupScanColumns = groupScanColumns;
  this.conditions = conditions;
  this.fileSplit = fileSplit;
  this.footer = footer;
  this.jobConf = jobConf;
  this.readerFactory = readerFactory;
  this.vectorize = vectorize;
  this.enableDetailedTracing = enableDetailedTracing;
}
/**
 * Get the list of row group numbers for the given file input split. The logic used here is the same as
 * the logic Hive's Parquet input format uses to find the row group numbers for an input split.
 */
private static List<Integer> getRowGroupNumbersFromFileSplit(final FileSplit split,
    final ParquetMetadata footer) throws IOException {
  final List<BlockMetaData> blocks = footer.getBlocks();

  final long splitStart = split.getStart();
  final long splitLength = split.getLength();

  final List<Integer> rowGroupNums = Lists.newArrayList();

  int i = 0;
  for (final BlockMetaData block : blocks) {
    final long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
    if (firstDataPage >= splitStart && firstDataPage < splitStart + splitLength) {
      rowGroupNums.add(i);
    }
    i++;
  }

  return rowGroupNums;
}
static RecordReader<NullWritable, DynamoDBItemWritable> getRecordReader(
    InputSplit inputSplit, JobConf job, Reporter reporter) throws IOException {
  // CombineFileSplit indicates the new export format which includes a manifest file
  if (inputSplit instanceof CombineFileSplit) {
    int version = job.getInt(DynamoDBConstants.EXPORT_FORMAT_VERSION, -1);
    if (version != ExportManifestRecordWriter.FORMAT_VERSION) {
      throw new IOException("Unknown version: " + job.get(DynamoDBConstants.EXPORT_FORMAT_VERSION));
    }
    return new ImportCombineFileRecordReader((CombineFileSplit) inputSplit, job, reporter);
  } else if (inputSplit instanceof FileSplit) {
    // FileSplit indicates the old data pipeline format which doesn't include a manifest file
    Path path = ((FileSplit) inputSplit).getPath();
    return new ImportRecordReader(job, path);
  } else {
    throw new IOException("Expecting CombineFileSplit or FileSplit but the input split type is: "
        + inputSplit.getClass());
  }
}
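// Hypothetical setup for the CombineFileSplit branch above (illustration only): the job configuration
// must carry the export format version expected by ExportManifestRecordWriter, otherwise the version
// check in getRecordReader throws an IOException.
JobConf job = new JobConf();
job.setInt(DynamoDBConstants.EXPORT_FORMAT_VERSION, ExportManifestRecordWriter.FORMAT_VERSION);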
public IndexRRecordReader(InputSplit inputSplit, Configuration configuration) throws IOException {
  FileSplit fileSplit = (FileSplit) inputSplit;
  Preconditions.checkState(fileSplit.getStart() == 0, "Segment should not be split");
  Path filePath = fileSplit.getPath();

  // Hive may ask to read a file located on the local file system.
  // We have to get the real file system from the path's scheme.
  FileSystem fileSystem = FileSystem.get(filePath.toUri(), FileSystem.get(configuration).getConf());

  if (SegmentHelper.checkSegmentByPath(filePath)) {
    ByteBufferReader.Opener opener = ByteBufferReader.Opener.create(fileSystem, filePath);
    IntegratedSegment.Fd fd = IntegratedSegment.Fd.create(filePath.toString(), opener);
    if (fd != null) {
      segment = fd.open();
      offset = 0L;
      rowIterator = segment.rowTraversal().iterator();
      getIncludeColumns(configuration, segment);
    }
  } else {
    LOG.warn("ignore " + filePath);
  }
}
@Override
public FileSplit[] getSplits(JobConf job, int numSplits) throws IOException {
  // first, merge input table properties (since there's no access to them ...)
  Settings settings = HadoopSettingsManager.loadFrom(job);
  //settings.merge(IOUtils.propsFromString(settings.getProperty(HiveConstants.INPUT_TBL_PROPERTIES)));

  Log log = LogFactory.getLog(getClass());
  // move on to initialization
  InitializationUtils.setValueReaderIfNotSet(settings, HiveValueReader.class, log);
  settings.setProperty(InternalConfigurationOptions.INTERNAL_ES_TARGET_FIELDS,
      StringUtils.concatenateAndUriEncode(HiveUtils.columnToAlias(settings), ","));
  // set read resource
  settings.setResourceRead(settings.getResourceRead());
  HiveUtils.init(settings, log);

  // decorate original splits as FileSplit
  InputSplit[] shardSplits = super.getSplits(job, numSplits);
  FileSplit[] wrappers = new FileSplit[shardSplits.length];
  Path path = new Path(job.get(HiveConstants.TABLE_LOCATION));
  for (int i = 0; i < wrappers.length; i++) {
    wrappers[i] = new EsHiveSplit(shardSplits[i], path);
  }
  return wrappers;
}
@Test
public void testOrcDataStream() throws Exception {
  HiveOutputFormat<?, ?> outputFormat = new org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat();
  InputFormat<?, ?> inputFormat = new org.apache.hadoop.hive.ql.io.orc.OrcInputFormat();
  @SuppressWarnings("deprecation")
  SerDe serde = new org.apache.hadoop.hive.ql.io.orc.OrcSerde();
  File file = File.createTempFile("presto_test", "orc");
  file.delete();
  try {
    FileSplit split = createTestFile(file.getAbsolutePath(), outputFormat, serde, null, TEST_COLUMNS, NUM_ROWS);
    testPageSourceFactory(new OrcPageSourceFactory(TYPE_MANAGER), split, inputFormat, serde, TEST_COLUMNS);
  } finally {
    //noinspection ResultOfMethodCallIgnored
    file.delete();
  }
}
@Test(enabled = false)
public void testRcTextPageSource() throws Exception {
  HiveOutputFormat<?, ?> outputFormat = new RCFileOutputFormat();
  InputFormat<?, ?> inputFormat = new RCFileInputFormat<>();
  @SuppressWarnings("deprecation")
  SerDe serde = new ColumnarSerDe();
  File file = File.createTempFile("presto_test", "rc-binary");
  file.delete();
  try {
    FileSplit split = createTestFile(file.getAbsolutePath(), outputFormat, serde, null, TEST_COLUMNS, NUM_ROWS);
    testPageSourceFactory(new RcFilePageSourceFactory(TYPE_MANAGER), split, inputFormat, serde, TEST_COLUMNS);
  } finally {
    //noinspection ResultOfMethodCallIgnored
    file.delete();
  }
}
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
  InputSplit[] tmp = super.getSplits(job, numSplits);

  //get partitioning information
  MatrixCharacteristics mc = MRJobConfiguration.getPartitionedMatrixSize(job);
  PDataPartitionFormat dpf = MRJobConfiguration.getPartitioningFormat(job);
  PartitionFormat pf = new PartitionFormat(dpf, -1);
  int blen = (int) (pf.isRowwise() ? pf.getNumRows(mc) : pf.getNumColumns(mc));
  String fname = MRJobConfiguration.getPartitioningFilename(job);

  //create wrapper splits
  InputSplit[] ret = new InputSplit[ tmp.length ];
  for( int i=0; i<tmp.length; i++ ) {
    //check for robustness of subsequent cast
    if( tmp[i] instanceof FileSplit )
      ret[i] = new RemoteParForColocatedFileSplit( (FileSplit) tmp[i], fname, blen );
    else
      ret[i] = tmp[i];
  }

  return ret;
}
/**
 * Constructor
 * @param job
 * @param split
 * @throws IOException
 */
public LineDocRecordReader(Configuration job, FileSplit split) throws IOException {
  long start = split.getStart();
  long end = start + split.getLength();
  final Path file = split.getPath();

  // open the file and seek to the start of the split
  FileSystem fs = file.getFileSystem(job);
  FSDataInputStream fileIn = fs.open(split.getPath());
  InputStream in = fileIn;
  boolean skipFirstLine = false;
  if (start != 0) {
    skipFirstLine = true; // wait till BufferedInputStream to skip
    --start;
    fileIn.seek(start);
  }
  this.in = new BufferedInputStream(in);
  if (skipFirstLine) {
    // skip first line and re-establish "start".
    start += LineDocRecordReader.readData(this.in, null, EOL);
  }
  this.start = start;
  this.pos = start;
  this.end = end;
}
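// Illustrative sketch (not part of the original class): two adjacent FileSplits over the same file.
// A reader whose split starts at a non-zero offset skips its first, partial line, as the constructor
// above does, because the reader of the preceding split reads past its own end to finish that line.
static FileSplit[] splitInTwo(FileSystem fs, Path file) throws IOException {
  long len = fs.getFileStatus(file).getLen();
  long half = len / 2;
  return new FileSplit[] {
      new FileSplit(file, 0, half, new String[0]),
      new FileSplit(file, half, len - half, new String[0])
  };
}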
@SuppressWarnings("unchecked")
public RecordReader<K, V> getRecordReader(InputSplit split, JobConf conf, Reporter reporter)
    throws IOException {
  // Find the InputFormat and then the RecordReader from the TaggedInputSplit.
  TaggedInputSplit taggedInputSplit = (TaggedInputSplit) split;
  InputFormat<K, V> inputFormat = (InputFormat<K, V>) ReflectionUtils
      .newInstance(taggedInputSplit.getInputFormatClass(), conf);
  InputSplit inputSplit = taggedInputSplit.getInputSplit();
  if (inputSplit instanceof FileSplit) {
    FileSplit fileSplit = (FileSplit) inputSplit;
    conf.set(MRConfigurationNames.MR_MAP_INPUT_FILE, fileSplit.getPath().toString());
    conf.setLong(MRConfigurationNames.MR_MAP_INPUT_START, fileSplit.getStart());
    conf.setLong(MRConfigurationNames.MR_MAP_INPUT_LENGTH, fileSplit.getLength());
  }

  return inputFormat.getRecordReader(taggedInputSplit.getInputSplit(), conf, reporter);
}
public void map(Text key, Text value, OutputCollector<Text, Text> output, Reporter reporter)
    throws IOException {
  if (lastKey == null) {
    filename = getFilename((FileSplit) reporter.getInputSplit());
    output.collect(new Text(filename + ":begin"), key);
    lastKey = new Text();
    this.output = output;
  } else {
    if (key.compareTo(lastKey) < 0) {
      output.collect(error, new Text("misorder in " + filename + " last: '" + lastKey
          + "' current: '" + key + "'"));
    }
  }
  lastKey.set(key);
}
@Test
public void testParquetUseColumnNames() throws Exception {
  List<TestColumn> testColumns = getTestColumnsSupportedByParquet();

  HiveOutputFormat<?, ?> outputFormat = new MapredParquetOutputFormat();
  InputFormat<?, ?> inputFormat = new MapredParquetInputFormat();
  @SuppressWarnings("deprecation")
  SerDe serde = new ParquetHiveSerDe();
  File file = File.createTempFile("presto_test", "parquet");
  file.delete();
  try {
    FileSplit split = createTestFile(file.getAbsolutePath(), outputFormat, serde, null, testColumns, NUM_ROWS);
    // Reverse the order of the columns to test access by name, not by index
    Collections.reverse(testColumns);
    HiveRecordCursorProvider cursorProvider = new ParquetRecordCursorProvider(true);
    testCursorProvider(cursorProvider, split, inputFormat, serde, testColumns, NUM_ROWS);
  } finally {
    //noinspection ResultOfMethodCallIgnored
    file.delete();
  }
}
@Test
public void testParquet() throws Exception {
  List<TestColumn> testColumns = getTestColumnsSupportedByParquet();

  HiveOutputFormat<?, ?> outputFormat = new MapredParquetOutputFormat();
  InputFormat<?, ?> inputFormat = new MapredParquetInputFormat();
  @SuppressWarnings("deprecation")
  SerDe serde = new ParquetHiveSerDe();
  File file = File.createTempFile("presto_test", "parquet");
  file.delete();
  try {
    FileSplit split = createTestFile(file.getAbsolutePath(), outputFormat, serde, null, testColumns, NUM_ROWS);
    HiveRecordCursorProvider cursorProvider = new ParquetRecordCursorProvider(false);
    testCursorProvider(cursorProvider, split, inputFormat, serde, testColumns, NUM_ROWS);
  } finally {
    //noinspection ResultOfMethodCallIgnored
    file.delete();
  }
}
@Test
public void testDwrfDataStream() throws Exception {
  List<TestColumn> testColumns = ImmutableList.copyOf(filter(TEST_COLUMNS, testColumn -> {
    ObjectInspector objectInspector = testColumn.getObjectInspector();
    return !hasType(objectInspector, PrimitiveCategory.DATE);
  }));

  HiveOutputFormat<?, ?> outputFormat = new com.facebook.hive.orc.OrcOutputFormat();
  InputFormat<?, ?> inputFormat = new com.facebook.hive.orc.OrcInputFormat();
  @SuppressWarnings("deprecation")
  SerDe serde = new com.facebook.hive.orc.OrcSerde();
  File file = File.createTempFile("presto_test", "dwrf");
  file.delete();
  try {
    FileSplit split = createTestFile(file.getAbsolutePath(), outputFormat, serde, null, testColumns, NUM_ROWS);
    testPageSourceFactory(new DwrfPageSourceFactory(TYPE_MANAGER), split, inputFormat, serde, testColumns);
  } finally {
    //noinspection ResultOfMethodCallIgnored
    file.delete();
  }
}
public ParsedRecordReader ( FileSplit split,
                            Configuration conf,
                            Class<? extends Parser> parser_class,
                            Trees args ) throws IOException {
  start = split.getStart();
  end = start + split.getLength();
  Path file = split.getPath();
  FileSystem fs = file.getFileSystem(conf);
  fsin = fs.open(split.getPath());
  try {
    parser = parser_class.newInstance();
  } catch (Exception ex) {
    throw new Error("Unrecognized parser: " + parser_class);
  }
  parser.initialize(args);
  parser.open(fsin, start, end);
  result = null;
}
public XMLRecordReader(FileSplit split, JobConf jobConf) throws IOException {
  log.info("Setting up XMLRecordReader for path: [" + split.getPath() + "]");
  log.info("startTag=" + jobConf.get(START_TAG_KEY) + ", endTag=" + jobConf.get(END_TAG_KEY));

  startTag = jobConf.get(START_TAG_KEY).getBytes("utf-8");
  endTag = jobConf.get(END_TAG_KEY).getBytes("utf-8");

  // open the file and seek to the start of the split
  start = split.getStart();
  end = start + split.getLength();
  Path file = split.getPath();
  FileSystem fs = file.getFileSystem(jobConf);
  path = split.getPath().getName();
  fsin = fs.open(split.getPath());
  fsin.seek(start);
}
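// Hypothetical construction sequence (illustration only): the start and end tags read above must be set
// on the JobConf before the reader is built. The tag values and the FileSplit named split are
// placeholders, not values from the original code.
JobConf jobConf = new JobConf();
jobConf.set(START_TAG_KEY, "<record>");
jobConf.set(END_TAG_KEY, "</record>");
XMLRecordReader reader = new XMLRecordReader(split, jobConf);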
@Test
public void testRCBinary() throws Exception {
  List<TestColumn> testColumns = ImmutableList.copyOf(filter(TEST_COLUMNS, testColumn -> {
    // RC file does not support complex type as key of a map
    return !testColumn.getName().equals("t_map_null_key_complex_key_value");
  }));

  HiveOutputFormat<?, ?> outputFormat = new RCFileOutputFormat();
  InputFormat<?, ?> inputFormat = new RCFileInputFormat<>();
  @SuppressWarnings("deprecation")
  SerDe serde = new LazyBinaryColumnarSerDe();
  File file = File.createTempFile("presto_test", "rc-binary");
  try {
    FileSplit split = createTestFile(file.getAbsolutePath(), outputFormat, serde, null, testColumns, NUM_ROWS);
    testCursorProvider(new ColumnarBinaryHiveRecordCursorProvider(), split, inputFormat, serde, testColumns, NUM_ROWS);
    testCursorProvider(new GenericHiveRecordCursorProvider(), split, inputFormat, serde, testColumns, NUM_ROWS);
  } finally {
    //noinspection ResultOfMethodCallIgnored
    file.delete();
  }
}
@Test(enabled = false)
public void testRcBinaryPageSource() throws Exception {
  HiveOutputFormat<?, ?> outputFormat = new RCFileOutputFormat();
  InputFormat<?, ?> inputFormat = new RCFileInputFormat<>();
  @SuppressWarnings("deprecation")
  SerDe serde = new LazyBinaryColumnarSerDe();
  File file = File.createTempFile("presto_test", "rc-binary");
  file.delete();
  try {
    FileSplit split = createTestFile(file.getAbsolutePath(), outputFormat, serde, null, TEST_COLUMNS, NUM_ROWS);
    testPageSourceFactory(new RcFilePageSourceFactory(TYPE_MANAGER), split, inputFormat, serde, TEST_COLUMNS);
  } finally {
    //noinspection ResultOfMethodCallIgnored
    file.delete();
  }
}
public RangePickRecordReader(JobConf job, FileSplit split) throws IOException {
  parseSelectedRangeString(job.get(SELECTED_RANGES));

  // check if the current part file needs to be processed
  path = split.getPath();
  totLength = split.getLength();
  currentStream = IOUtilFunctions.getFileSystem(path, job).open(path);
  currPart = getIndexInTheArray(path.getName());

  if (currPart < beginPart || currPart > endPart) {
    noRecordsNeeded = true;
    return;
  }

  int part0 = job.getInt(PARTITION_OF_ZERO, -1);
  boolean contain0s = false;
  long numZeros = 0;
  if (part0 == currPart) {
    contain0s = true;
    numZeros = job.getLong(NUMBER_OF_ZERO, 0);
  }
  reader = new ReadWithZeros(currentStream, contain0s, numZeros);
}
/**
 * Helper function that iterates through a RecordReader and asserts the record count.
 */
public void testForReadAllRecordsNotStrict(String fileName, int expectedRecordCount) throws IOException {
  CsvInputFormat inputFormat = helper.createCSVInputFormat(conf);
  File inputFile = helper.getFile(fileName);
  Path inputPath = new Path(inputFile.getAbsoluteFile().toURI().toString());
  FileSplit split = helper.createFileSplit(inputPath, 0, inputFile.length());
  RecordReader createdReader = helper.createRecordReader(inputFormat, split, jobConf);

  LongWritable key = new LongWritable();
  ListWritable<Text> value = new ListWritable<Text>(Text.class);

  int actualRecordCount = 0;
  while (createdReader.next(key, value)) {
    actualRecordCount++;
  }
  assertEquals(expectedRecordCount, actualRecordCount);
}
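// Hypothetical usage of the helper above; the file name and expected count are placeholders, not values
// from the original test suite.
@Test
public void readsAllRecordsFromSampleCsv() throws IOException {
  testForReadAllRecordsNotStrict("sample.csv", 10);
}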
public DrillTextRecordReader(FileSplit split, Configuration fsConf, FragmentContext context,
    char delimiter, List<SchemaPath> columns) {
  this.delimiter = (byte) delimiter;
  this.split = split;
  setColumns(columns);

  if (!isStarQuery()) {
    String pathStr;
    for (SchemaPath path : columns) {
      assert path.getRootSegment().isNamed();
      pathStr = path.getRootSegment().getPath();
      Preconditions.checkArgument(pathStr.equals(COL_NAME)
          || (pathStr.equals("*") && path.getRootSegment().getChild() == null),
          "Selected column(s) must have name 'columns' or must be plain '*'");

      if (path.getRootSegment().getChild() != null) {
        Preconditions.checkArgument(path.getRootSegment().getChild().isArray(),
            "Selected column must be an array index");
        int index = path.getRootSegment().getChild().getArraySegment().getIndex();
        columnIds.add(index);
      }
    }
    Collections.sort(columnIds);
    numCols = columnIds.size();
  }

  TextInputFormat inputFormat = new TextInputFormat();
  JobConf job = new JobConf(fsConf);
  job.setInt("io.file.buffer.size", context.getConfig().getInt(ExecConstants.TEXT_LINE_READER_BUFFER_SIZE));
  job.setInputFormat(inputFormat.getClass());
  try {
    reader = inputFormat.getRecordReader(split, job, Reporter.NULL);
    key = reader.createKey();
    value = reader.createValue();
    totalRecordsRead = 0;
  } catch (Exception e) {
    handleAndRaise("Failure in creating record reader", e);
  }
}
@Override
public RecordReader getRecordReader(FragmentContext context, DrillFileSystem dfs, FileWork fileWork,
    List<SchemaPath> columns) throws ExecutionSetupException {
  Path path = dfs.makeQualified(new Path(fileWork.getPath()));
  FileSplit split = new FileSplit(path, fileWork.getStart(), fileWork.getLength(), new String[]{""});

  if (context.getOptions().getOption(ExecConstants.ENABLE_NEW_TEXT_READER_KEY).bool_val == true) {
    TextParsingSettings settings = new TextParsingSettings();
    settings.set((TextFormatConfig) formatConfig);
    return new CompliantTextRecordReader(split, dfs, context, settings, columns);
  } else {
    char delim = ((TextFormatConfig) formatConfig).getFieldDelimiter();
    return new DrillTextRecordReader(split, dfs.getConf(), context, delim, columns);
  }
}