/**
 * Initializes the Mapper, and sets input parameters for the job. All of
 * the records in the dataStore are used as the input. If you want to
 * include a specific subset, use one of the overloaded methods which takes
 * a query parameter.
 * @param job the job to set the properties for
 * @param dataStoreClass the datastore class
 * @param inKeyClass Map input key class
 * @param inValueClass Map input value class
 * @param outKeyClass Map output key class
 * @param outValueClass Map output value class
 * @param mapperClass the mapper class extending GoraMapper
 * @param partitionerClass optional partitioner class
 * @param reuseObjects whether to reuse objects in serialization
 */
@SuppressWarnings("rawtypes")
public static <K1, V1 extends Persistent, K2, V2> void initMapperJob(
    Job job,
    Class<? extends DataStore<K1,V1>> dataStoreClass,
    Class<K1> inKeyClass, Class<V1> inValueClass,
    Class<K2> outKeyClass, Class<V2> outValueClass,
    Class<? extends GoraMapper> mapperClass,
    Class<? extends Partitioner> partitionerClass,
    boolean reuseObjects) throws IOException {

  // set the input via GoraInputFormat
  GoraInputFormat.setInput(job, dataStoreClass, inKeyClass, inValueClass, reuseObjects);

  job.setMapperClass(mapperClass);
  job.setMapOutputKeyClass(outKeyClass);
  job.setMapOutputValueClass(outValueClass);

  if (partitionerClass != null) {
    job.setPartitionerClass(partitionerClass);
  }
}
/**
 * Initializes the Mapper, and sets input parameters for the job.
 * @param job the job to set the properties for
 * @param query the query to get the inputs from
 * @param dataStore the datastore as the input
 * @param outKeyClass Map output key class
 * @param outValueClass Map output value class
 * @param mapperClass the mapper class extending GoraMapper
 * @param partitionerClass optional partitioner class
 * @param reuseObjects whether to reuse objects in serialization
 */
@SuppressWarnings("rawtypes")
public static <K1, V1 extends Persistent, K2, V2> void initMapperJob(
    Job job,
    Query<K1,V1> query,
    DataStore<K1,V1> dataStore,
    Class<K2> outKeyClass, Class<V2> outValueClass,
    Class<? extends GoraMapper> mapperClass,
    Class<? extends Partitioner> partitionerClass,
    boolean reuseObjects) throws IOException {

  // set the input via GoraInputFormat
  GoraInputFormat.setInput(job, query, dataStore, reuseObjects);

  job.setMapperClass(mapperClass);
  job.setMapOutputKeyClass(outKeyClass);
  job.setMapOutputValueClass(outValueClass);

  if (partitionerClass != null) {
    job.setPartitionerClass(partitionerClass);
  }
}
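A minimal usage sketch for the query-based overload above. The WebPage persistent bean, MyGoraMapper (a GoraMapper subclass), and the Text/LongWritable map-output types are assumptions chosen for illustration; only initMapperJob itself and the standard Gora/Hadoop factory calls come from known APIs.

// Sketch only: WebPage and MyGoraMapper are hypothetical; output types are illustrative.
Job job = Job.getInstance(conf, "gora-mapper-example");

DataStore<String, WebPage> store =
    DataStoreFactory.getDataStore(String.class, WebPage.class, conf);
Query<String, WebPage> query = store.newQuery();   // narrow the input here if desired

// Query-scoped input, no custom partitioner (null), object reuse enabled.
GoraMapper.initMapperJob(job, query, store, Text.class, LongWritable.class,
    MyGoraMapper.class, null, true);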
/**
 * Extracts package-private TeraSort total order partitioner class.
 *
 * @return The class.
 */
@SuppressWarnings("unchecked")
private Class<? extends Partitioner> getTeraSortTotalOrderPartitioner() {
  Class[] classes = TeraSort.class.getDeclaredClasses();

  Class<? extends Partitioner> totalOrderPartitionerCls = null;

  for (Class<?> x : classes) {
    if ("TotalOrderPartitioner".equals(x.getSimpleName())) {
      totalOrderPartitionerCls = (Class<? extends Partitioner>) x;
      break;
    }
  }

  if (totalOrderPartitionerCls == null)
    throw new IllegalStateException("Failed to find TeraSort total order partitioner class.");

  return totalOrderPartitionerCls;
}
@Override
public int run(String[] args) throws Exception {
  if (args.length != 2) {
    JobBuilder.printUsage(this, "<path> <key>");
    return -1;
  }

  Path path = new Path(args[0]);
  IntWritable key = new IntWritable(Integer.parseInt(args[1]));

  Reader[] readers = MapFileOutputFormat.getReaders(path, getConf());
  Partitioner<IntWritable, Text> partitioner = new HashPartitioner<IntWritable, Text>();
  Text val = new Text();

  Writable entry = MapFileOutputFormat.getEntry(readers, partitioner, key, val);
  if (entry == null) {
    System.err.println("Key not found: " + key);
    return -1;
  }

  NcdcRecordParser parser = new NcdcRecordParser();
  parser.parse(val.toString());
  System.out.printf("%s\t%s\n", parser.getStationId(), parser.getYear());
  return 0;
}
/**
 * Initializes the Mapper, and sets input parameters for the job. All of
 * the records in the dataStore are used as the input. If you want to
 * include a specific subset, use one of the overloaded methods which takes
 * a query parameter.
 *
 * @param job the job to set the properties for
 * @param dataStoreClass the datastore class
 * @param inKeyClass Map input key class
 * @param inValueClass Map input value class
 * @param outKeyClass Map output key class
 * @param outValueClass Map output value class
 * @param mapperClass the mapper class extending GoraMapper
 * @param partitionerClass optional partitioner class
 * @param reuseObjects whether to reuse objects in serialization
 * @param <K1> Map input key class
 * @param <V1> Map input value class
 * @param <K2> Map output key class
 * @param <V2> Map output value class
 * @throws IOException if there is an error initializing the Map job
 */
@SuppressWarnings("rawtypes")
public static <K1, V1 extends Persistent, K2, V2> void initMapperJob(
    Job job,
    Class<? extends DataStore<K1,V1>> dataStoreClass,
    Class<K1> inKeyClass, Class<V1> inValueClass,
    Class<K2> outKeyClass, Class<V2> outValueClass,
    Class<? extends GoraMapper> mapperClass,
    Class<? extends Partitioner> partitionerClass,
    boolean reuseObjects) throws IOException {

  // set the input via GoraInputFormat
  GoraInputFormat.setInput(job, dataStoreClass, inKeyClass, inValueClass, reuseObjects);

  job.setMapperClass(mapperClass);
  job.setMapOutputKeyClass(outKeyClass);
  job.setMapOutputValueClass(outValueClass);

  if (partitionerClass != null) {
    job.setPartitionerClass(partitionerClass);
  }
}
/**
 * Initializes the Mapper, and sets input parameters for the job.
 *
 * @param job the job to set the properties for
 * @param query the query to get the inputs from
 * @param outKeyClass Map output key class
 * @param outValueClass Map output value class
 * @param mapperClass the mapper class extending GoraMapper
 * @param partitionerClass optional partitioner class
 * @param reuseObjects whether to reuse objects in serialization
 * @param <K1> Map input key class
 * @param <V1> Map input value class
 * @param <K2> Map output key class
 * @param <V2> Map output value class
 * @throws IOException if there is an error initializing the Map job
 */
@SuppressWarnings("rawtypes")
public static <K1, V1 extends Persistent, K2, V2> void initMapperJob(
    Job job,
    Query<K1,V1> query,
    Class<K2> outKeyClass, Class<V2> outValueClass,
    Class<? extends GoraMapper> mapperClass,
    Class<? extends Partitioner> partitionerClass,
    boolean reuseObjects) throws IOException {

  // set the input via GoraInputFormat
  GoraInputFormat.setInput(job, query, reuseObjects);

  job.setMapperClass(mapperClass);
  job.setMapOutputKeyClass(outKeyClass);
  job.setMapOutputValueClass(outValueClass);

  if (partitionerClass != null) {
    job.setPartitionerClass(partitionerClass);
  }
}
private void assertData(int totalShardCount) throws IOException {
  Partitioner<IntWritable, IntWritable> partitioner = new HashPartitioner<IntWritable, IntWritable>();
  for (int i = 0; i < totalShardCount; i++) {
    HdfsDirectory directory = new HdfsDirectory(configuration, new Path(path, ShardUtil.getShardName(i)));
    DirectoryReader reader = DirectoryReader.open(directory);
    int numDocs = reader.numDocs();
    for (int d = 0; d < numDocs; d++) {
      Document document = reader.document(d);
      IndexableField field = document.getField("id");
      Integer id = (Integer) field.numericValue();
      int partition = partitioner.getPartition(new IntWritable(id), null, totalShardCount);
      assertEquals(i, partition);
    }
    reader.close();
  }
}
private static void createShard(Configuration configuration, int i, Path path, int totalShardCount)
    throws IOException {
  HdfsDirectory hdfsDirectory = new HdfsDirectory(configuration, path);
  IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_43, new KeywordAnalyzer());
  TieredMergePolicy mergePolicy = (TieredMergePolicy) conf.getMergePolicy();
  mergePolicy.setUseCompoundFile(false);
  IndexWriter indexWriter = new IndexWriter(hdfsDirectory, conf);

  Partitioner<IntWritable, IntWritable> partitioner = new HashPartitioner<IntWritable, IntWritable>();
  int partition = partitioner.getPartition(new IntWritable(i), null, totalShardCount);
  assertEquals(i, partition);

  Document doc = getDoc(i);
  indexWriter.addDocument(doc);
  indexWriter.close();
}
protected final GroupingOptions groupingOptions(
    Class<? extends Partitioner> partitionerClass,
    Class<? extends RawComparator<?>> groupingComparator,
    Class<? extends RawComparator<?>> sortComparator) {
  GroupingOptions.Builder b = GroupingOptions.builder()
      .partitionerClass(partitionerClass)
      .numReducers(getNumReducers());
  if (groupingComparator != null) {
    b.groupingComparatorClass(groupingComparator);
  }
  if (sortComparator != null) {
    b.sortComparatorClass(sortComparator);
  }
  return b.build();
}
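A hedged usage sketch for this helper, assuming it lives in a Crunch pipeline base class and is called from a subclass. The PTable variable 'table' and the choice of HashPartitioner with no custom comparators are assumptions for illustration; groupByKey(GroupingOptions) is the standard Crunch call that consumes the options.

// Sketch only: 'table' (a PTable<String, Long>) is assumed.
GroupingOptions opts = groupingOptions(HashPartitioner.class, null, null);

// Crunch applies the partitioner, comparators, and reducer count when grouping.
PGroupedTable<String, Long> grouped = table.groupByKey(opts);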
/** Get an entry from output generated by this class. */
public static <K extends WritableComparable<?>, V extends Writable> Writable getEntry(
    MapFile.Reader[] readers, Partitioner<K, V> partitioner, K key, V value) throws IOException {
  int part = partitioner.getPartition(key, value, readers.length);
  return readers[part].get(key, value);
}
/**
 * Get the {@link Partitioner} class for the job.
 *
 * @return the {@link Partitioner} class for the job.
 */
@SuppressWarnings("unchecked")
public Class<? extends Partitioner<?,?>> getPartitionerClass() throws ClassNotFoundException {
  return (Class<? extends Partitioner<?,?>>)
      conf.getClass(PARTITIONER_CLASS_ATTR, HashPartitioner.class);
}
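A minimal sketch of how a caller might consume this accessor. The 'context' and 'conf' variables are assumed (a JobContext-like object and its Configuration); ReflectionUtils.newInstance is the standard Hadoop way to instantiate a configured class.

// Sketch only: 'context' and 'conf' are assumed. Because getPartitionerClass()
// defaults to HashPartitioner when PARTITIONER_CLASS_ATTR is unset, this always
// yields a usable instance.
Class<? extends Partitioner<?,?>> cls = context.getPartitionerClass();
Partitioner<?,?> partitioner = ReflectionUtils.newInstance(cls, conf);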
/** Get an entry from output generated by this class. */
public static <K extends WritableComparable<?>, V extends Writable> Writable getEntry(
    MapFile.Reader[] readers, Partitioner<K, V> partitioner, K key, V value) throws IOException {
  int readerLength = readers.length;
  int part;
  if (readerLength <= 1) {
    part = 0;
  } else {
    part = partitioner.getPartition(key, value, readers.length);
  }
  return readers[part].get(key, value);
}
public Job createJob(
    Configuration configuration,
    int numberOfNodes,
    long currentGenerationNumber,
    String generationNameFormat,
    Path currentGenerationsBlockReportsFolderPath,
    Schema individualWrapperSchema
) throws IOException {
  // Creates a job.
  Job job = super.createJob(configuration, numberOfNodes, currentGenerationNumber,
      currentGenerationNumber, (currentGenerationNumber - 1L), currentGenerationNumber,
      generationNameFormat, currentGenerationsBlockReportsFolderPath, individualWrapperSchema,
      GlobalMapper.class, Partitioner.class, Reducer.class);

  // Sets the input.
  NodesInputFormat.setInputPopulationFolderPath(job, this.getInputFolderPath());
  NodesInputFormat.activateInitialisation(job, false);

  // Configures the fitness value class.
  job.getConfiguration().setClass(Constants.CONFIGURATION_FITNESS_VALUE_CLASS,
      this.fitnessValueClass, FitnessValue.class);

  // Configures the Fitness Evaluation phase.
  job.getConfiguration().setClass(Constants.CONFIGURATION_FITNESS_EVALUATION_CLASS,
      this.fitnessEvaluationClass, FitnessEvaluation.class);

  // Disables the reducer.
  job.setNumReduceTasks(0);

  // Returns the job.
  return job;
}
/**
 * If > hadoop 0.20, then we want to use the hadoop TotalOrderPartitioner.
 * If 0.20, then we want to use the TOP that we have under hadoopbackport.
 * This method is about hbase being able to run on different versions of
 * hadoop. In 0.20.x hadoops, we have to use the TOP that is bundled with
 * hbase. Otherwise, we use the one in Hadoop.
 * @return The TotalOrderPartitioner class to use
 * @throws ClassNotFoundException If can't find a TotalOrderPartitioner.
 */
private static Class<? extends Partitioner> getTotalOrderPartitionerClass()
    throws ClassNotFoundException {
  Class<? extends Partitioner> clazz = null;
  try {
    clazz = (Class<? extends Partitioner>)
        Class.forName("org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner");
  } catch (ClassNotFoundException e) {
    clazz = (Class<? extends Partitioner>)
        Class.forName("org.apache.hadoop.hbase.mapreduce.hadoopbackport.TotalOrderPartitioner");
  }
  return clazz;
}
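A one-line usage sketch, assuming a Job being configured for total-order (bulk-load style) output; only setPartitionerClass is standard Hadoop API, and 'job' is an assumed variable.

// Sketch only: 'job' is an org.apache.hadoop.mapreduce.Job under configuration.
job.setPartitionerClass(getTotalOrderPartitionerClass());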
private void setupAccumuloPartitionerWithGivenPartitioner(final Class<? extends Partitioner> partitioner)
    throws IOException {
  // Given
  final JobConf localConf = createLocalConf();
  final FileSystem fs = FileSystem.getLocal(localConf);
  fs.mkdirs(new Path(outputDir));
  fs.mkdirs(new Path(splitsDir));
  try (final BufferedWriter writer =
           new BufferedWriter(new OutputStreamWriter(fs.create(new Path(splitsFile), true)))) {
    writer.write("1");
  }

  final AccumuloAddElementsFromHdfsJobFactory factory = new AccumuloAddElementsFromHdfsJobFactory();
  final Job job = mock(Job.class);
  final AddElementsFromHdfs operation = new AddElementsFromHdfs.Builder()
      .outputPath(outputDir)
      .partitioner(partitioner)
      .useProvidedSplits(true)
      .splitsFilePath(splitsFile)
      .build();
  final AccumuloStore store = mock(AccumuloStore.class);

  given(job.getConfiguration()).willReturn(localConf);

  // When
  factory.setupJob(job, operation, TextMapperGeneratorImpl.class.getName(), store);

  // Then
  if (NoPartitioner.class.equals(partitioner)) {
    verify(job, never()).setNumReduceTasks(Mockito.anyInt());
    verify(job, never()).setPartitionerClass(Mockito.any(Class.class));
    assertNull(job.getConfiguration().get(GafferRangePartitioner.class.getName() + ".cutFile"));
  } else {
    verify(job).setNumReduceTasks(2);
    verify(job).setPartitionerClass(GafferKeyRangePartitioner.class);
    assertEquals(splitsFile, job.getConfiguration().get(GafferRangePartitioner.class.getName() + ".cutFile"));
  }
}
@Test
public void shouldJSONSerialiseAndDeserialise() throws SerialisationException {
  // Given
  final Map<String, String> inputMapperPairs = new HashMap<>();
  inputMapperPairs.put("inputPath", MapperGenerator.class.getName());
  final AddElementsFromHdfs addElements = new AddElementsFromHdfs.Builder()
      .inputMapperPairs(inputMapperPairs)
      .outputPath("outputPath")
      .failurePath("failurePath")
      .jobInitialiser(new TextJobInitialiser())
      .partitioner(Partitioner.class)
      .mappers(5)
      .reducers(10)
      .splitsFilePath("/path/to/splits/file")
      .useProvidedSplits(false)
      .build();

  // When
  String json = new String(JSONSerialiser.serialise(addElements, true));

  // Then
  JsonAssert.assertEquals(String.format("{%n" +
      " \"class\" : \"uk.gov.gchq.gaffer.hdfs.operation.AddElementsFromHdfs\",%n" +
      " \"failurePath\" : \"failurePath\",%n" +
      " \"validate\" : true,%n" +
      " \"inputMapperPairs\" : { \"inputPath\" :\"uk.gov.gchq.gaffer.hdfs.operation.mapper.generator.MapperGenerator\"},%n" +
      " \"outputPath\" : \"outputPath\",%n" +
      " \"jobInitialiser\" : {%n" +
      " \"class\" : \"uk.gov.gchq.gaffer.hdfs.operation.handler.job.initialiser.TextJobInitialiser\"%n" +
      " },%n" +
      " \"numMapTasks\" : 5,%n" +
      " \"numReduceTasks\" : 10,%n" +
      " \"splitsFilePath\" : \"/path/to/splits/file\",%n" +
      " \"partitioner\" : \"org.apache.hadoop.mapreduce.Partitioner\"%n" +
      "}"), json);
}
/**
 * Validates that the first non-empty partition hfile uses the right partitioning function.
 * It reads several keys, then calculates the partition according to the partitioning function
 * the client is offering. If the calculated partition number differs from the actual partition
 * number, an exception is thrown. If all partition hfiles are empty, an exception is thrown.
 *
 * @param parts full absolute path for all partitions
 * @param partitionerType type of partitioning function
 * @param numShards total number of partitions
 * @throws IOException if something goes wrong when reading the hfiles
 * @throws IllegalArgumentException if the partitioner type is wrong or all partitions are empty
 */
public void validate(List<Path> parts, PartitionerType partitionerType, int numShards)
    throws IOException {
  boolean hasNonEmptyPartition = false;
  HColumnDescriptor columnDescriptor = new HColumnDescriptor();
  // Disable block cache to ensure it reads the actual file content.
  columnDescriptor.setBlockCacheEnabled(false);
  for (int shardIndex = 0; shardIndex < parts.size(); shardIndex++) {
    Path fileToBeValidated = parts.get(shardIndex);
    HFile.Reader reader = null;
    try {
      FileSystem fs = FileSystem.newInstance(fileToBeValidated.toUri(), conf);
      CacheConfig cc = new CacheConfig(conf, columnDescriptor);
      reader = HFile.createReader(fs, fileToBeValidated, cc);
      Partitioner partitioner = PartitionerFactory.getPartitioner(partitionerType);
      byte[] rowKey = reader.getFirstRowKey();
      if (rowKey == null) {
        LOG.warn(String.format("empty partition %s", fileToBeValidated.toString()));
        reader.close();
        continue;
      }
      hasNonEmptyPartition = true;
      BytesWritable key = new BytesWritable(rowKey);
      int partition = partitioner.getPartition(key, null, numShards);
      if (partition != shardIndex) {
        throw new IllegalArgumentException(
            String.format("wrong partition type %s for key %s in partition %d, expected %d",
                partitionerType.toString(), new String(key.getBytes()), shardIndex, partition));
      }
    } finally {
      if (reader != null) {
        reader.close();
      }
    }
  }
  if (!hasNonEmptyPartition) {
    throw new IllegalArgumentException("all partitions are empty");
  }
}
/**
 * Get the partitioner. If the partitioner type is CASCADING, return the
 * cascading partitioner; if it is MODULUS, return the hash partitioner.
 * Any other type is unsupported and causes a RuntimeException.
 */
public static Partitioner getPartitioner(PartitionerType type) {
  if (type.equals(PartitionerType.CASCADING)) {
    return CASCADING_PARTITIONER;
  } else if (type.equals(PartitionerType.MODULUS)) {
    return HASH_PARTITIONER;
  } else {
    throw new RuntimeException("Unsupported ShardFunction." + type);
  }
}
public static String getPartitionName(ByteBuffer key,
                                      PartitionerType partitionerType,
                                      int numPartitions) {
  Partitioner partitioner = PartitionerFactory.getPartitioner(partitionerType);
  return Integer.toString(partitioner.getPartition(
      new BytesWritable(BytesUtil.readBytesFromByteBufferWithoutConsume(key)),
      null,
      numPartitions));
}
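A brief usage sketch; the key bytes and the shard count are arbitrary illustrative values, and MODULUS resolves to the hash partitioner via the factory shown earlier.

// Sketch only: arbitrary key and shard count.
String shardName = getPartitionName(ByteBuffer.wrap("000123".getBytes()), PartitionerType.MODULUS, 8);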
/**
 * Generate hfiles for testing purposes.
 *
 * @param sourceFileSystem source file system
 * @param conf configuration for hfile
 * @param outputFolder output folder for generated hfiles
 * @param partitionerType partitioner type
 * @param numOfPartitions number of partitions
 * @param numOfKeys number of keys
 * @return list of generated hfiles
 * @throws IOException if hfile creation goes wrong
 */
public static List<Path> generateHFiles(FileSystem sourceFileSystem, Configuration conf,
                                        File outputFolder, PartitionerType partitionerType,
                                        int numOfPartitions, int numOfKeys) throws IOException {
  StoreFile.Writer[] writers = new StoreFile.Writer[numOfPartitions];
  for (int i = 0; i < numOfPartitions; i++) {
    writers[i] = new StoreFile.WriterBuilder(conf, new CacheConfig(conf), sourceFileSystem, 4096)
        .withFilePath(new Path(String.format("%s/%s", outputFolder.getAbsoluteFile(),
            TerrapinUtil.formatPartitionName(i))))
        .withCompression(Compression.Algorithm.NONE)
        .build();
  }
  Partitioner partitioner = PartitionerFactory.getPartitioner(partitionerType);
  for (int i = 0; i < numOfKeys; i++) {
    byte[] key = String.format("%06d", i).getBytes();
    byte[] value;
    if (i <= 1) {
      value = "".getBytes();
    } else {
      value = ("v" + (i + 1)).getBytes();
    }
    KeyValue kv = new KeyValue(key, Bytes.toBytes("cf"), Bytes.toBytes(""), value);
    int partition = partitioner.getPartition(new BytesWritable(key), new BytesWritable(value),
        numOfPartitions);
    writers[partition].append(kv);
  }
  for (int i = 0; i < numOfPartitions; i++) {
    writers[i].close();
  }
  return Lists.transform(Lists.newArrayList(writers), new Function<StoreFile.Writer, Path>() {
    @Override
    public Path apply(StoreFile.Writer writer) {
      return writer.getPath();
    }
  });
}
public ITuplePartitionComputerFactory getTuplePartitionComputer() throws HyracksDataException {
  int nReducers = job.getNumReduceTasks();
  try {
    return new HadoopNewPartitionerTuplePartitionComputerFactory<Writable, Writable>(
        (Class<? extends Partitioner<Writable, Writable>>) job.getPartitionerClass(),
        (ISerializerDeserializer<Writable>) DatatypeHelper
            .createSerializerDeserializer((Class<? extends Writable>) job.getMapOutputKeyClass()),
        (ISerializerDeserializer<Writable>) DatatypeHelper
            .createSerializerDeserializer((Class<? extends Writable>) job.getMapOutputValueClass()));
  } catch (ClassNotFoundException e) {
    throw new HyracksDataException(e);
  }
}
@SuppressWarnings("unchecked") @Override public void setConf(Configuration conf) { this.conf = conf; Class<Partitioner>[] partitionersClass = (Class<Partitioner>[]) conf.getClasses(CONF_KEY); partitioners = new ArrayList<Partitioner<Object, Object>>(partitionersClass.length); for (Class<Partitioner> partitionerClass : partitionersClass) { partitioners.add(ReflectionUtils.newInstance(partitionerClass, conf)); } numReducers = Ints.asList(conf.getInts(NUM_REDUCERS_KEY)); }