/**
 * Returns an {@link OutputStream} for a file that might need
 * compression.
 */
static OutputStream getPossiblyCompressedOutputStream(Path file,
                                                      Configuration conf)
    throws IOException {
  FileSystem fs = file.getFileSystem(conf);
  JobConf jConf = new JobConf(conf);

  if (org.apache.hadoop.mapred.FileOutputFormat.getCompressOutput(jConf)) {
    // get the codec class
    Class<? extends CompressionCodec> codecClass =
        org.apache.hadoop.mapred.FileOutputFormat
            .getOutputCompressorClass(jConf, GzipCodec.class);
    // get the codec implementation
    CompressionCodec codec = ReflectionUtils.newInstance(codecClass, conf);

    // add the appropriate extension
    file = file.suffix(codec.getDefaultExtension());

    if (isCompressionEmulationEnabled(conf)) {
      FSDataOutputStream fileOut = fs.create(file, false);
      return new DataOutputStream(codec.createOutputStream(fileOut));
    }
  }
  return fs.create(file, false);
}
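For context, a minimal caller sketch for the helper above (a sketch only: the output path and payload are illustrative, imports are elided as in the surrounding snippets, and the call assumes the caller can reach this package-private method):

// Enable compressed output, then write through the possibly-compressed stream.
// The ".gz" suffix is appended internally by the helper above.
JobConf conf = new JobConf();
org.apache.hadoop.mapred.FileOutputFormat.setCompressOutput(conf, true);
org.apache.hadoop.mapred.FileOutputFormat.setOutputCompressorClass(conf, GzipCodec.class);
CompressionEmulationUtil.setCompressionEmulationEnabled(conf, true);

Path out = new Path("outDir", "part-0");  // illustrative path
OutputStream os =
    CompressionEmulationUtil.getPossiblyCompressedOutputStream(out, conf);
try {
  os.write("sample payload".getBytes(StandardCharsets.UTF_8));
} finally {
  os.close();
}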
@Test
/**
 * Create an IFile.Writer using GzipCodec since this codec does not
 * have a compressor when run via the tests (i.e. no native libraries).
 */
public void testIFileWriterWithCodec() throws Exception {
  Configuration conf = new Configuration();
  FileSystem localFs = FileSystem.getLocal(conf);
  FileSystem rfs = ((LocalFileSystem) localFs).getRaw();
  Path path = new Path(new Path("build/test.ifile"), "data");
  DefaultCodec codec = new GzipCodec();
  codec.setConf(conf);
  IFile.Writer<Text, Text> writer =
      new IFile.Writer<Text, Text>(conf, rfs.create(path),
                                   Text.class, Text.class, codec, null);
  writer.close();
}
@Test
/** Same as above but create a reader. */
public void testIFileReaderWithCodec() throws Exception {
  Configuration conf = new Configuration();
  FileSystem localFs = FileSystem.getLocal(conf);
  FileSystem rfs = ((LocalFileSystem) localFs).getRaw();
  Path path = new Path(new Path("build/test.ifile"), "data");
  DefaultCodec codec = new GzipCodec();
  codec.setConf(conf);
  FSDataOutputStream out = rfs.create(path);
  IFile.Writer<Text, Text> writer =
      new IFile.Writer<Text, Text>(conf, out,
                                   Text.class, Text.class, codec, null);
  writer.close();
  FSDataInputStream in = rfs.open(path);
  IFile.Reader<Text, Text> reader =
      new IFile.Reader<Text, Text>(conf, in,
                                   rfs.getFileStatus(path).getLen(),
                                   codec, null);
  reader.close();

  // test checksum
  byte[] ab = new byte[100];
  int bytesRead = reader.checksumIn.readWithChecksum(ab, 0, ab.length);
  assertEquals(bytesRead, reader.checksumIn.getChecksum().length);
}
/**
 * Test
 * {@link CompressionEmulationUtil#getPossiblyDecompressedInputStream(Path,
 * Configuration, long)}
 * and
 * {@link CompressionEmulationUtil#getPossiblyCompressedOutputStream(Path,
 * Configuration)}.
 */
@Test
public void testPossiblyCompressedDecompressedStreams() throws IOException {
  JobConf conf = new JobConf();
  FileSystem lfs = FileSystem.getLocal(conf);
  String inputLine = "Hi Hello!";

  CompressionEmulationUtil.setCompressionEmulationEnabled(conf, true);
  CompressionEmulationUtil.setInputCompressionEmulationEnabled(conf, true);
  conf.setBoolean(FileOutputFormat.COMPRESS, true);
  conf.setClass(FileOutputFormat.COMPRESS_CODEC, GzipCodec.class,
                CompressionCodec.class);

  // define the test's root temp directory
  Path rootTempDir =
      new Path(System.getProperty("test.build.data", "/tmp"))
          .makeQualified(lfs.getUri(), lfs.getWorkingDirectory());
  Path tempDir =
      new Path(rootTempDir, "TestPossiblyCompressedDecompressedStreams");
  lfs.delete(tempDir, true);

  // create a compressed file
  Path compressedFile = new Path(tempDir, "test");
  OutputStream out =
      CompressionEmulationUtil.getPossiblyCompressedOutputStream(compressedFile,
                                                                 conf);
  BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out));
  writer.write(inputLine);
  writer.close();

  // now read back the data from the compressed stream
  compressedFile = compressedFile.suffix(".gz");
  InputStream in =
      CompressionEmulationUtil
          .getPossiblyDecompressedInputStream(compressedFile, conf, 0);
  BufferedReader reader = new BufferedReader(new InputStreamReader(in));
  String readLine = reader.readLine();
  assertEquals("Compression/Decompression error", inputLine, readLine);
  reader.close();
}
/**
 * For running a few tests of methods herein.
 * @param args
 * @throws IOException
 */
public static void main(String[] args) throws IOException {
  int count = 1024;
  int size = 10240;
  for (String arg : args) {
    if (arg.startsWith(COUNT)) {
      count = Integer.parseInt(arg.replace(COUNT, ""));
    } else if (arg.startsWith(SIZE)) {
      size = Integer.parseInt(arg.replace(SIZE, ""));
    } else {
      usage(1);
    }
  }
  IPCUtil util = new IPCUtil(HBaseConfiguration.create());
  ((Log4JLogger) IPCUtil.LOG).getLogger().setLevel(Level.ALL);
  timerTests(util, count, size, new KeyValueCodec(), null);
  timerTests(util, count, size, new KeyValueCodec(), new DefaultCodec());
  timerTests(util, count, size, new KeyValueCodec(), new GzipCodec());
}
@Test
public void readBitcoinRawBlockInputFormatGzipCompressed() throws IOException {
  JobConf job = new JobConf(defaultConf);
  CompressionCodec gzip = new GzipCodec();
  ReflectionUtils.setConf(gzip, job);
  ClassLoader classLoader = getClass().getClassLoader();
  String fileName = "version4comp.blk.gz";
  String fileNameBlock = classLoader.getResource("testdata/" + fileName).getFile();
  Path file = new Path(fileNameBlock);
  FileInputFormat.setInputPaths(job, file);
  BitcoinRawBlockFileInputFormat format = new BitcoinRawBlockFileInputFormat();
  format.configure(job);
  InputSplit[] inputSplits = format.getSplits(job, 1);
  assertEquals(1, inputSplits.length, "Only one split generated for compressed block");
  RecordReader<BytesWritable, BytesWritable> reader =
      format.getRecordReader(inputSplits[0], job, reporter);
  assertNotNull(reader, "Format returned null RecordReader");
  BytesWritable key = new BytesWritable();
  BytesWritable block = new BytesWritable();
  assertTrue(reader.next(key, block),
      "Input Split for block version contains at least one block");
  assertEquals(998039, block.getLength(),
      "Compressed block must have a size of 998,039 bytes");
  BytesWritable emptyKey = new BytesWritable();
  BytesWritable emptyBlock = new BytesWritable();
  assertFalse(reader.next(emptyKey, emptyBlock),
      "No further blocks in compressed block");
  reader.close();
}
@Test
public void readBitcoinTransactionInputFormatGzipCompressed() throws IOException {
  JobConf job = new JobConf(defaultConf);
  CompressionCodec gzip = new GzipCodec();
  ReflectionUtils.setConf(gzip, job);
  ClassLoader classLoader = getClass().getClassLoader();
  String fileName = "version4comp.blk.gz";
  String fileNameBlock = classLoader.getResource("testdata/" + fileName).getFile();
  Path file = new Path(fileNameBlock);
  FileInputFormat.setInputPaths(job, file);
  BitcoinTransactionFileInputFormat format = new BitcoinTransactionFileInputFormat();
  format.configure(job);
  InputSplit[] inputSplits = format.getSplits(job, 1);
  assertEquals(1, inputSplits.length, "Only one split generated for compressed block");
  RecordReader<BytesWritable, BitcoinTransaction> reader =
      format.getRecordReader(inputSplits[0], job, reporter);
  assertNotNull(reader, "Format returned null RecordReader");
  BytesWritable key = new BytesWritable();
  BitcoinTransaction transaction = new BitcoinTransaction();
  int transactCount = 0;
  while (reader.next(key, transaction)) {
    transactCount++;
  }
  assertEquals(936, transactCount,
      "Compressed block must contain exactly 936 transactions");
  reader.close();
}
@Test
public void readBitcoinRawBlockInputFormatGzipCompressed()
    throws IOException, InterruptedException {
  Configuration conf = new Configuration(defaultConf);
  Job job = Job.getInstance(conf);
  CompressionCodec gzip = new GzipCodec();
  ReflectionUtils.setConf(gzip, conf);
  ClassLoader classLoader = getClass().getClassLoader();
  String fileName = "version4comp.blk.gz";
  String fileNameBlock = classLoader.getResource("testdata/" + fileName).getFile();
  Path file = new Path(fileNameBlock);
  FileInputFormat.setInputPaths(job, file);
  BitcoinRawBlockFileInputFormat format = new BitcoinRawBlockFileInputFormat();
  List<InputSplit> splits = format.getSplits(job);
  TaskAttemptContext context =
      new TaskAttemptContextImpl(conf, new TaskAttemptID());
  assertEquals(1, splits.size(), "Only one split generated for compressed block");
  RecordReader<BytesWritable, BytesWritable> reader =
      format.createRecordReader(splits.get(0), context);
  assertNotNull(reader, "Format returned null RecordReader");
  reader.initialize(splits.get(0), context);
  BytesWritable key = new BytesWritable();
  BytesWritable block = new BytesWritable();
  assertTrue(reader.nextKeyValue(),
      "Input Split for block version contains at least one block");
  block = reader.getCurrentValue();
  assertEquals(998039, block.getLength(),
      "Compressed block must have a size of 998,039 bytes");
  assertFalse(reader.nextKeyValue(), "No further blocks in compressed block");
  reader.close();
}
@Test
public void readBitcoinTransactionInputFormatGzipCompressed()
    throws IOException, InterruptedException {
  Configuration conf = new Configuration(defaultConf);
  Job job = Job.getInstance(conf);
  CompressionCodec gzip = new GzipCodec();
  ReflectionUtils.setConf(gzip, conf);
  ClassLoader classLoader = getClass().getClassLoader();
  String fileName = "version4comp.blk.gz";
  String fileNameBlock = classLoader.getResource("testdata/" + fileName).getFile();
  Path file = new Path(fileNameBlock);
  FileInputFormat.setInputPaths(job, file);
  BitcoinTransactionFileInputFormat format = new BitcoinTransactionFileInputFormat();
  List<InputSplit> splits = format.getSplits(job);
  TaskAttemptContext context =
      new TaskAttemptContextImpl(conf, new TaskAttemptID());
  assertEquals(1, splits.size(), "Only one split generated for compressed block");
  RecordReader<BytesWritable, BitcoinTransaction> reader =
      format.createRecordReader(splits.get(0), context);
  assertNotNull(reader, "Format returned null RecordReader");
  reader.initialize(splits.get(0), context);
  int transactCount = 0;
  while (reader.nextKeyValue()) {
    transactCount++;
  }
  assertEquals(936, transactCount,
      "Compressed block must contain exactly 936 transactions");
  reader.close();
}
@Override
public RecordWriter<NullWritable, DynamoDBItemWritable> getRecordWriter(
    FileSystem ignored, JobConf job, String name, Progressable progress)
    throws IOException {
  boolean isCompressed = getCompressOutput(job);
  CompressionCodec codec = null;
  String extension = "";
  DataOutputStream fileOut;

  if (isCompressed) {
    Class<? extends CompressionCodec> codecClass =
        getOutputCompressorClass(job, GzipCodec.class);
    codec = ReflectionUtils.newInstance(codecClass, job);
    extension = codec.getDefaultExtension();
  }

  Path file = new Path(FileOutputFormat.getOutputPath(job), name + extension);
  FileSystem fs = file.getFileSystem(job);
  if (!isCompressed) {
    fileOut = fs.create(file, progress);
  } else {
    fileOut = new DataOutputStream(
        codec.createOutputStream(fs.create(file, progress)));
  }
  return new ExportRecordWriter(fileOut);
}
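The writer above follows the standard compressed-output pattern of the old "mapred" API; enabling it from the job side takes two standard FileOutputFormat calls (a minimal sketch; the output path is hypothetical):

// Driver-side configuration for the record writer above.
// getRecordWriter() will then append ".gz" and wrap the stream in the codec.
JobConf job = new JobConf();
FileOutputFormat.setOutputPath(job, new Path("/tmp/export"));  // hypothetical path
FileOutputFormat.setCompressOutput(job, true);
FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);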
private int runMetastoreCompareJob(Path output)
    throws IOException, InterruptedException, ClassNotFoundException {
  Job job = Job.getInstance(this.getConf(), "Stage1: Metastore Compare Job");

  job.setJarByClass(this.getClass());
  job.setInputFormatClass(MetastoreScanInputFormat.class);
  job.setMapperClass(Stage1ProcessTableMapper.class);
  job.setReducerClass(Stage1PartitionCompareReducer.class);
  job.setOutputKeyClass(LongWritable.class);
  job.setOutputValueClass(Text.class);

  FileOutputFormat.setOutputPath(job, output);
  FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

  boolean success = job.waitForCompletion(true);
  return success ? 0 : 1;
}
@Test
/**
 * Create an IFile.Writer using GzipCodec since this codec does not
 * have a compressor when run via the tests (i.e. no native libraries).
 */
public void testIFileWriterWithCodec() throws Exception {
  Configuration conf = new Configuration();
  FileSystem localFs = FileSystem.getLocal(conf);
  FileSystem rfs = ((LocalFileSystem) localFs).getRaw();
  Path path = new Path(new Path("build/test.ifile"), "data");
  DefaultCodec codec = new GzipCodec();
  codec.setConf(conf);
  IFile.Writer<Text, Text> writer =
      new IFile.Writer<Text, Text>(conf, rfs, path,
                                   Text.class, Text.class, codec, null);
  writer.close();
}
@Test
/** Same as above but create a reader. */
public void testIFileReaderWithCodec() throws Exception {
  Configuration conf = new Configuration();
  FileSystem localFs = FileSystem.getLocal(conf);
  FileSystem rfs = ((LocalFileSystem) localFs).getRaw();
  Path path = new Path(new Path("build/test.ifile"), "data");
  DefaultCodec codec = new GzipCodec();
  codec.setConf(conf);
  IFile.Writer<Text, Text> writer =
      new IFile.Writer<Text, Text>(conf, rfs, path,
                                   Text.class, Text.class, codec, null);
  writer.close();
  IFile.Reader<Text, Text> reader =
      new IFile.Reader<Text, Text>(conf, rfs, path, codec, null);
  reader.close();
}
/**
 * Construct the preferred type of 'raw' SequenceFile Writer.
 * @param conf The configuration.
 * @param out The stream on top of which the writer is to be constructed.
 * @param keyClass The 'key' type.
 * @param valClass The 'value' type.
 * @param compress Compress data?
 * @param blockCompress Compress blocks?
 * @param codec The compression codec.
 * @param metadata The metadata of the file.
 * @return Returns the handle to the constructed SequenceFile Writer.
 * @throws IOException
 */
private static Writer createWriter(Configuration conf, FSDataOutputStream out,
    Class keyClass, Class valClass, boolean compress, boolean blockCompress,
    CompressionCodec codec, Metadata metadata) throws IOException {
  if (codec != null && (codec instanceof GzipCodec) &&
      !NativeCodeLoader.isNativeCodeLoaded() &&
      !ZlibFactory.isNativeZlibLoaded(conf)) {
    throw new IllegalArgumentException("SequenceFile doesn't work with " +
                                       "GzipCodec without native-hadoop code!");
  }

  Writer writer = null;
  if (!compress) {
    writer = new Writer(conf, out, keyClass, valClass, metadata);
  } else if (compress && !blockCompress) {
    writer = new RecordCompressWriter(conf, out, keyClass, valClass,
                                      codec, metadata);
  } else {
    writer = new BlockCompressWriter(conf, out, keyClass, valClass,
                                     codec, metadata);
  }
  return writer;
}
/**
 * Construct the preferred type of 'raw' SequenceFile Writer.
 * @param conf The configuration.
 * @param out The stream on top of which the writer is to be constructed.
 * @param keyClass The 'key' type.
 * @param valClass The 'value' type.
 * @param compressionType The compression type.
 * @param codec The compression codec.
 * @param metadata The metadata of the file.
 * @return Returns the handle to the constructed SequenceFile Writer.
 * @throws IOException
 */
public static Writer createWriter(Configuration conf, FSDataOutputStream out,
    Class keyClass, Class valClass, CompressionType compressionType,
    CompressionCodec codec, Metadata metadata) throws IOException {
  if ((codec instanceof GzipCodec) &&
      !NativeCodeLoader.isNativeCodeLoaded() &&
      !ZlibFactory.isNativeZlibLoaded(conf)) {
    throw new IllegalArgumentException("SequenceFile doesn't work with " +
                                       "GzipCodec without native-hadoop code!");
  }

  Writer writer = null;
  if (compressionType == CompressionType.NONE) {
    writer = new Writer(conf, out, keyClass, valClass, metadata);
  } else if (compressionType == CompressionType.RECORD) {
    writer = new RecordCompressWriter(conf, out, keyClass, valClass,
                                      codec, metadata);
  } else if (compressionType == CompressionType.BLOCK) {
    writer = new BlockCompressWriter(conf, out, keyClass, valClass,
                                     codec, metadata);
  }
  return writer;
}
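A minimal caller sketch for the public overload above (the path is illustrative; DefaultCodec is used because, per the guard above, GzipCodec requires the native zlib):

// Write one Text record through a record-compressed SequenceFile writer.
Configuration conf = new Configuration();
FileSystem fs = FileSystem.getLocal(conf);
FSDataOutputStream out = fs.create(new Path("build/test.seq"));  // illustrative path
DefaultCodec codec = new DefaultCodec();
codec.setConf(conf);
SequenceFile.Writer writer = SequenceFile.createWriter(
    conf, out, Text.class, Text.class,
    SequenceFile.CompressionType.RECORD, codec, new SequenceFile.Metadata());
writer.append(new Text("key"), new Text("value"));
writer.close();
out.close();  // this overload writes to a caller-owned stream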
@Override
public RecordWriter<LongWritable, Document> getRecordWriter(
    TaskAttemptContext job) throws IOException, InterruptedException {
  Configuration conf = job.getConfiguration();
  boolean isCompressed = getCompressOutput(job);
  CompressionCodec codec = null;
  String extension = "";

  if (isCompressed) {
    Class<? extends CompressionCodec> codecClass =
        getOutputCompressorClass(job, GzipCodec.class);
    codec = ReflectionUtils.newInstance(codecClass, conf);
    extension = codec.getDefaultExtension();
  }

  Path path = getDefaultWorkFile(job, extension);
  FileSystem fs = path.getFileSystem(conf);
  FSDataOutputStream out = fs.create(path, false);
  if (!isCompressed) {
    return new JSONFileOutputRecordWriter(out);
  } else {
    return new JSONFileOutputRecordWriter(new DataOutputStream(
        codec.createOutputStream(out)));
  }
}
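Enabling compression for this format from the driver uses the "mapreduce"-API counterparts of the calls shown earlier (a sketch; the output-format class name JSONFileOutputFormat and the output path are hypothetical, since the snippet above only names its record writer):

// Driver-side configuration for the new "mapreduce" API.
Job job = Job.getInstance(new Configuration(), "json-output");  // illustrative name
job.setOutputFormatClass(JSONFileOutputFormat.class);           // hypothetical class
FileOutputFormat.setOutputPath(job, new Path("/tmp/json-out")); // hypothetical path
FileOutputFormat.setCompressOutput(job, true);
FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);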
@Test
/**
 * Create an IFile.Writer using GzipCodec since this codec does not
 * have a compressor when run via the tests (i.e. no native libraries).
 */
public void testIFileWriterWithCodec() throws Exception {
  Configuration conf = new Configuration();
  FileSystem localFs = FileSystem.getLocal(conf);
  FileSystem rfs = ((LocalFileSystem) localFs).getRaw();
  Path path = new Path(new Path("build/test.ifile"), "data");
  DefaultCodec codec = new GzipCodec();
  codec.setConf(conf);
  IFile.Writer<Text, Text> writer =
      new IFile.Writer<Text, Text>(conf, rfs, path,
                                   Text.class, Text.class, codec, null);
  writer.close();
}
@Test
/** Same as above but create a reader. */
public void testIFileReaderWithCodec() throws Exception {
  Configuration conf = new Configuration();
  FileSystem localFs = FileSystem.getLocal(conf);
  FileSystem rfs = ((LocalFileSystem) localFs).getRaw();
  Path path = new Path(new Path("build/test.ifile"), "data");
  DefaultCodec codec = new GzipCodec();
  codec.setConf(conf);
  IFile.Writer<Text, Text> writer =
      new IFile.Writer<Text, Text>(conf, rfs, path,
                                   Text.class, Text.class, codec, null);
  writer.close();
  IFile.Reader<Text, Text> reader =
      new IFile.Reader<Text, Text>(conf, rfs, path, codec, null);
  reader.close();

  // test checksum
  byte[] ab = new byte[100];
  int bytesRead = reader.checksumIn.readWithChecksum(ab, 0, ab.length);
  assertEquals(bytesRead, reader.checksumIn.getChecksum().length);
}
public int run(String[] args) throws Exception {
  Configuration conf = new Configuration();
  conf.set("mapreduce.output.textoutputformat.separator", "|");
  conf.setBoolean("mapreduce.output.fileoutputformat.compress", true);
  // codec for intermediate (map) output, if map-output compression is enabled
  conf.setClass("mapreduce.map.output.compression.codec",
                GzipCodec.class, CompressionCodec.class);

  Job job = Job.getInstance(conf, "clusterCombine");
  job.setJarByClass(ClusterCombineMR.class);
  job.setMapperClass(ClusterMapper.class);
  job.setReducerClass(ClusterReducer.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);

  FileInputFormat.addInputPath(job, new Path(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));
  FileOutputFormat.setCompressOutput(job, true);
  FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

  return job.waitForCompletion(true) ? 0 : 1;
}
public static void main(String[] args) throws Exception {
  Configuration conf = new Configuration();
  Job job = new Job(conf, "geolitelocation");
  job.setJarByClass(GeoLiteLocation.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setMapperClass(Map.class);
  job.setReducerClass(Reducer.class);
  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);
  TextOutputFormat.setCompressOutput(job, true);
  TextOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
  FileInputFormat.addInputPath(job, new Path(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));
  job.waitForCompletion(true);
}
public static void main(String[] args) throws Exception {
  Configuration conf = new Configuration();
  Job job = new Job(conf, "geoipcountry");
  job.setJarByClass(GeoIPCountry.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setMapperClass(Map.class);
  job.setReducerClass(Reducer.class);
  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);
  TextOutputFormat.setCompressOutput(job, true);
  TextOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
  FileInputFormat.addInputPath(job, new Path(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));
  job.waitForCompletion(true);
}
public static void main(String[] args) throws Exception {
  Configuration conf = new Configuration();
  Job job = new Job(conf, "loadipdns");
  job.setJarByClass(LoadIPDns.class);
  job.setOutputKeyClass(LongWritable.class);
  job.setOutputValueClass(Text.class);
  job.setMapperClass(Map.class);
  job.setReducerClass(Reducer.class);
  job.setMapOutputKeyClass(LongWritable.class);
  job.setMapOutputValueClass(Text.class);
  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);
  job.setNumReduceTasks(100);
  TextOutputFormat.setCompressOutput(job, true);
  TextOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
  FileInputFormat.addInputPath(job, new Path(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));
  job.waitForCompletion(true);
}
public static void main(String[] args) throws Exception {
  final String NAME_NODE = "hdfs://hadoop-master:9000";
  final int partition_no = 148;

  Configuration conf = new Configuration();
  Job job = new Job(conf, "uniqueips");
  job.setJarByClass(PrepareSflows.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(NullWritable.class);
  job.setMapOutputKeyClass(IntWritable.class);
  job.setMapOutputValueClass(Text.class);
  job.setMapperClass(Map.class);
  job.setReducerClass(Reduce.class);
  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);
  job.setPartitionerClass(CustomPartitioner.class);
  DistributedCache.addCacheFile(new URI(NAME_NODE + "/user/root/" + args[2]),
                                job.getConfiguration());
  TextOutputFormat.setCompressOutput(job, true);
  TextOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
  job.setNumReduceTasks(partition_no * partition_no);
  FileInputFormat.addInputPath(job, new Path(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));
  job.waitForCompletion(true);
}