/** * Test using the gzip codec for reading */ @Test(timeout=10000) public void testGzip() throws IOException { JobConf job = new JobConf(defaultConf); CompressionCodec gzip = new GzipCodec(); ReflectionUtils.setConf(gzip, job); localFs.delete(workDir, true); writeFile(localFs, new Path(workDir, "part1.txt.gz"), gzip, "the quick\nbrown\nfox jumped\nover\n the lazy\n dog\n"); writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip, "this is a test\nof gzip\n"); FileInputFormat.setInputPaths(job, workDir); CombineTextInputFormat format = new CombineTextInputFormat(); InputSplit[] splits = format.getSplits(job, 100); assertEquals("compressed splits == 1", 1, splits.length); List<Text> results = readSplit(format, splits[0], job); assertEquals("splits[0] length", 8, results.size()); final String[] firstList = {"the quick", "brown", "fox jumped", "over", " the lazy", " dog"}; final String[] secondList = {"this is a test", "of gzip"}; String first = results.get(0).toString(); if (first.equals(firstList[0])) { testResults(results, firstList, secondList); } else if (first.equals(secondList[0])) { testResults(results, secondList, firstList); } else { fail("unexpected first token!"); } }
@Test(timeout=10000) public void testFormat() throws Exception { JobConf job = new JobConf(defaultConf); Random random = new Random(); long seed = random.nextLong(); LOG.info("seed = "+seed); random.setSeed(seed); localFs.delete(workDir, true); FileInputFormat.setInputPaths(job, workDir); final int length = 10000; final int numFiles = 10; createFiles(length, numFiles, random); // create a combined split for the files CombineTextInputFormat format = new CombineTextInputFormat(); LongWritable key = new LongWritable(); Text value = new Text(); for (int i = 0; i < 3; i++) { int numSplits = random.nextInt(length/20)+1; LOG.info("splitting: requesting = " + numSplits); InputSplit[] splits = format.getSplits(job, numSplits); LOG.info("splitting: got = " + splits.length); // we should have a single split as the length is comfortably smaller than // the block size assertEquals("We got more than one splits!", 1, splits.length); InputSplit split = splits[0]; assertEquals("It should be CombineFileSplit", CombineFileSplit.class, split.getClass()); // check the split BitSet bits = new BitSet(length); LOG.debug("split= " + split); RecordReader<LongWritable, Text> reader = format.getRecordReader(split, job, voidReporter); try { int count = 0; while (reader.next(key, value)) { int v = Integer.parseInt(value.toString()); LOG.debug("read " + v); if (bits.get(v)) { LOG.warn("conflict with " + v + " at position "+reader.getPos()); } assertFalse("Key in multiple partitions.", bits.get(v)); bits.set(v); count++; } LOG.info("splits="+split+" count=" + count); } finally { reader.close(); } assertEquals("Some keys in no partition.", length, bits.cardinality()); } }