public ReduceContextImpl(Configuration conf, TaskAttemptID taskid,
                         RawKeyValueIterator input,
                         Counter inputKeyCounter,
                         Counter inputValueCounter,
                         RecordWriter<KEYOUT,VALUEOUT> output,
                         OutputCommitter committer,
                         StatusReporter reporter,
                         RawComparator<KEYIN> comparator,
                         Class<KEYIN> keyClass,
                         Class<VALUEIN> valueClass
                        ) throws InterruptedException, IOException {
  super(conf, taskid, output, committer, reporter);
  this.input = input;
  this.inputKeyCounter = inputKeyCounter;
  this.inputValueCounter = inputValueCounter;
  this.comparator = comparator;
  this.serializationFactory = new SerializationFactory(conf);
  this.keyDeserializer = serializationFactory.getDeserializer(keyClass);
  this.keyDeserializer.open(buffer);
  this.valueDeserializer = serializationFactory.getDeserializer(valueClass);
  this.valueDeserializer.open(buffer);
  hasMore = input.next();
  this.keyClass = keyClass;
  this.valueClass = valueClass;
  this.conf = conf;
  this.taskid = taskid;
}
public ValuesIterator(RawKeyValueIterator in,
                      RawComparator<KEY> comparator,
                      Class<KEY> keyClass, Class<VALUE> valClass,
                      Configuration conf, Progressable reporter)
    throws IOException {
  this.in = in;
  this.comparator = comparator;
  this.reporter = reporter;
  SerializationFactory serializationFactory = new SerializationFactory(conf);
  this.keyDeserializer = serializationFactory.getDeserializer(keyClass);
  this.keyDeserializer.open(keyIn);
  this.valDeserializer = serializationFactory.getDeserializer(valClass);
  this.valDeserializer.open(this.valueIn);
  readNextKey();
  key = nextKey;
  nextKey = null; // force new instance creation
  hasNext = more;
}
@SuppressWarnings("unchecked") protected OldCombinerRunner(Class<? extends Reducer<K,V,K,V>> cls, JobConf conf, Counters.Counter inputCounter, TaskReporter reporter) { super(inputCounter, conf, reporter); combinerClass = cls; keyClass = (Class<K>) job.getMapOutputKeyClass(); valueClass = (Class<V>) job.getMapOutputValueClass(); comparator = (RawComparator<K>) job.getCombinerKeyGroupingComparator(); }
@SuppressWarnings("unchecked") NewCombinerRunner(Class reducerClass, JobConf job, org.apache.hadoop.mapreduce.TaskAttemptID taskId, org.apache.hadoop.mapreduce.TaskAttemptContext context, Counters.Counter inputCounter, TaskReporter reporter, org.apache.hadoop.mapreduce.OutputCommitter committer) { super(inputCounter, job, reporter); this.reducerClass = reducerClass; this.taskId = taskId; keyClass = (Class<K>) context.getMapOutputKeyClass(); valueClass = (Class<V>) context.getMapOutputValueClass(); comparator = (RawComparator<K>) context.getCombinerKeyGroupingComparator(); this.committer = committer; }
public static <K extends Object, V extends Object>
RawKeyValueIterator merge(Configuration conf, FileSystem fs,
                          Class<K> keyClass, Class<V> valueClass,
                          CompressionCodec codec,
                          Path[] inputs, boolean deleteInputs,
                          int mergeFactor, Path tmpDir,
                          RawComparator<K> comparator, Progressable reporter,
                          Counters.Counter readsCounter,
                          Counters.Counter writesCounter,
                          Progress mergePhase)
    throws IOException {
  return new MergeQueue<K, V>(conf, fs, inputs, deleteInputs, codec,
                              comparator, reporter, null, TaskType.REDUCE)
      .merge(keyClass, valueClass, mergeFactor, tmpDir,
             readsCounter, writesCounter, mergePhase);
}

public static <K extends Object, V extends Object>
RawKeyValueIterator merge(Configuration conf, FileSystem fs,
                          Class<K> keyClass, Class<V> valueClass,
                          CompressionCodec codec,
                          Path[] inputs, boolean deleteInputs,
                          int mergeFactor, Path tmpDir,
                          RawComparator<K> comparator, Progressable reporter,
                          Counters.Counter readsCounter,
                          Counters.Counter writesCounter,
                          Counters.Counter mergedMapOutputsCounter,
                          Progress mergePhase)
    throws IOException {
  return new MergeQueue<K, V>(conf, fs, inputs, deleteInputs, codec,
                              comparator, reporter, mergedMapOutputsCounter,
                              TaskType.REDUCE)
      .merge(keyClass, valueClass, mergeFactor, tmpDir,
             readsCounter, writesCounter, mergePhase);
}

public static <K extends Object, V extends Object>
RawKeyValueIterator merge(Configuration conf, FileSystem fs,
                          Class<K> keyClass, Class<V> valueClass,
                          List<Segment<K, V>> segments,
                          int mergeFactor, Path tmpDir,
                          RawComparator<K> comparator, Progressable reporter,
                          Counters.Counter readsCounter,
                          Counters.Counter writesCounter,
                          Progress mergePhase)
    throws IOException {
  return merge(conf, fs, keyClass, valueClass, segments, mergeFactor, tmpDir,
               comparator, reporter, false, readsCounter, writesCounter,
               mergePhase);
}

public static <K extends Object, V extends Object>
RawKeyValueIterator merge(Configuration conf, FileSystem fs,
                          Class<K> keyClass, Class<V> valueClass,
                          List<Segment<K, V>> segments,
                          int mergeFactor, Path tmpDir,
                          RawComparator<K> comparator, Progressable reporter,
                          boolean sortSegments,
                          Counters.Counter readsCounter,
                          Counters.Counter writesCounter,
                          Progress mergePhase)
    throws IOException {
  return new MergeQueue<K, V>(conf, fs, segments, comparator, reporter,
                              sortSegments, TaskType.REDUCE)
      .merge(keyClass, valueClass, mergeFactor, tmpDir,
             readsCounter, writesCounter, mergePhase);
}

public static <K extends Object, V extends Object>
RawKeyValueIterator merge(Configuration conf, FileSystem fs,
                          Class<K> keyClass, Class<V> valueClass,
                          CompressionCodec codec,
                          List<Segment<K, V>> segments,
                          int mergeFactor, Path tmpDir,
                          RawComparator<K> comparator, Progressable reporter,
                          boolean sortSegments,
                          Counters.Counter readsCounter,
                          Counters.Counter writesCounter,
                          Progress mergePhase,
                          TaskType taskType)
    throws IOException {
  return new MergeQueue<K, V>(conf, fs, segments, comparator, reporter,
                              sortSegments, codec, taskType)
      .merge(keyClass, valueClass, mergeFactor, tmpDir,
             readsCounter, writesCounter, mergePhase);
}

public static <K extends Object, V extends Object>
RawKeyValueIterator merge(Configuration conf, FileSystem fs,
                          Class<K> keyClass, Class<V> valueClass,
                          List<Segment<K, V>> segments,
                          int mergeFactor, int inMemSegments, Path tmpDir,
                          RawComparator<K> comparator, Progressable reporter,
                          boolean sortSegments,
                          Counters.Counter readsCounter,
                          Counters.Counter writesCounter,
                          Progress mergePhase)
    throws IOException {
  return new MergeQueue<K, V>(conf, fs, segments, comparator, reporter,
                              sortSegments, TaskType.REDUCE)
      .merge(keyClass, valueClass, mergeFactor, inMemSegments, tmpDir,
             readsCounter, writesCounter, mergePhase);
}

public static <K extends Object, V extends Object>
RawKeyValueIterator merge(Configuration conf, FileSystem fs,
                          Class<K> keyClass, Class<V> valueClass,
                          CompressionCodec codec,
                          List<Segment<K, V>> segments,
                          int mergeFactor, int inMemSegments, Path tmpDir,
                          RawComparator<K> comparator, Progressable reporter,
                          boolean sortSegments,
                          Counters.Counter readsCounter,
                          Counters.Counter writesCounter,
                          Progress mergePhase)
    throws IOException {
  return new MergeQueue<K, V>(conf, fs, segments, comparator, reporter,
                              sortSegments, codec, TaskType.REDUCE)
      .merge(keyClass, valueClass, mergeFactor, inMemSegments, tmpDir,
             readsCounter, writesCounter, mergePhase);
}
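// Hedged usage sketch of the Path[] overload above. The spill paths, counters
// and key/value types are illustrative, not taken from the surrounding source;
// they show how a reduce-side merge is typically driven: the job's output-key
// comparator defines the sort order and the merge factor bounds how many
// streams are merged at once.
JobConf job = new JobConf();
FileSystem localFs = FileSystem.getLocal(job);
Path[] spills = { new Path("/tmp/spill0.out"), new Path("/tmp/spill1.out") };  // illustrative paths
Counters.Counter readsCounter =
    new Counters().findCounter(TaskCounter.SPILLED_RECORDS);   // stand-in counters
Counters.Counter writesCounter =
    new Counters().findCounter(TaskCounter.SPILLED_RECORDS);

RawKeyValueIterator merged = Merger.merge(
    job, localFs, Text.class, IntWritable.class,
    null /* no codec */, spills, true /* delete inputs */,
    job.getInt(MRJobConfig.IO_SORT_FACTOR, 100), new Path("/tmp/merge-tmp"),
    (RawComparator<Text>) job.getOutputKeyComparator(), Reporter.NULL,
    readsCounter, writesCounter, new Progress());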
public SkippingReduceValuesIterator(RawKeyValueIterator in,
                                    RawComparator<KEY> comparator,
                                    Class<KEY> keyClass, Class<VALUE> valClass,
                                    Configuration conf, TaskReporter reporter,
                                    TaskUmbilicalProtocol umbilical)
    throws IOException {
  super(in, comparator, keyClass, valClass, conf, reporter);
  this.umbilical = umbilical;
  this.skipGroupCounter =
      reporter.getCounter(TaskCounter.REDUCE_SKIPPED_GROUPS);
  this.skipRecCounter =
      reporter.getCounter(TaskCounter.REDUCE_SKIPPED_RECORDS);
  this.toWriteSkipRecs = toWriteSkipRecs() &&
      SkipBadRecords.getSkipOutputPath(conf) != null;
  this.keyClass = keyClass;
  this.valClass = valClass;
  this.reporter = reporter;
  skipIt = getSkipRanges().skipRangeIterator();
  mayBeSkip();
}
static BlockIndex readIndex(final RawComparator<byte []> c,
                            DataInputStream in, final int indexSize)
    throws IOException {
  BlockIndex bi = new BlockIndex(c);
  bi.blockOffsets = new long[indexSize];
  bi.blockKeys = new byte[indexSize][];
  bi.blockDataSizes = new int[indexSize];
  // If index size is zero, no index was written.
  if (indexSize > 0) {
    byte [] magic = new byte[INDEXBLOCKMAGIC.length];
    in.readFully(magic);
    if (!Arrays.equals(magic, INDEXBLOCKMAGIC)) {
      throw new IOException("Index block magic is wrong: " +
          Arrays.toString(magic));
    }
    for (int i = 0; i < indexSize; ++i) {
      long offset = in.readLong();
      int dataSize = in.readInt();
      byte [] key = Bytes.readByteArray(in);
      bi.add(key, offset, dataSize);
    }
  }
  return bi;
}
/**
 * Binary search for keys in indexes.
 *
 * @param arr array of byte arrays to search through
 * @param key the key you want to find
 * @param offset the offset into the key array at which the key starts
 * @param length the length of the key
 * @param comparator a comparator to compare keys with
 * @return zero-based index of the key, if the key is present in the array.
 *         Otherwise, a value -(i + 1) such that the key is between arr[i - 1]
 *         and arr[i] non-inclusively, where i is in [0, N], if we define
 *         arr[-1] = -Inf and arr[N] = +Inf for an N-element array. This means
 *         the function can return 2N + 1 different values, ranging from
 *         -(N + 1) to N - 1.
 */
public static int binarySearch(byte[][] arr, byte[] key, int offset,
    int length, RawComparator<?> comparator) {
  int low = 0;
  int high = arr.length - 1;
  while (low <= high) {
    int mid = (low + high) >>> 1;
    // we have to compare in this order, because the comparator order
    // has special logic when the 'left side' is a special key.
    int cmp = comparator.compare(key, offset, length,
        arr[mid], 0, arr[mid].length);
    // key lives above the midpoint
    if (cmp > 0)
      low = mid + 1;
    // key lives below the midpoint
    else if (cmp < 0)
      high = mid - 1;
    // BAM. how often does this really happen?
    else
      return mid;
  }
  return -(low + 1);
}
/**
 * Binary search for keys in indexes.
 *
 * @param arr array of byte arrays to search through
 * @param key the key you want to find
 * @param comparator a comparator to compare keys with
 * @return zero-based index of the key (i.e. the block), if the key is present
 *         in the array. Otherwise, a value -(i + 1) such that the key is
 *         between arr[i - 1] and arr[i] non-inclusively, where i is in [0, N],
 *         if we define arr[-1] = -Inf and arr[N] = +Inf for an N-element
 *         array. This means the function can return 2N + 1 different values,
 *         ranging from -(N + 1) to N - 1.
 */
public static int binarySearch(byte[][] arr, Cell key,
    RawComparator<Cell> comparator) {
  int low = 0;
  int high = arr.length - 1;
  KeyValue.KeyOnlyKeyValue r = new KeyValue.KeyOnlyKeyValue();
  while (low <= high) {
    int mid = (low + high) >>> 1;
    // we have to compare in this order, because the comparator order
    // has special logic when the 'left side' is a special key.
    r.setKey(arr[mid], 0, arr[mid].length);
    int cmp = comparator.compare(key, r);
    // key lives above the midpoint
    if (cmp > 0)
      low = mid + 1;
    // key lives below the midpoint
    else if (cmp < 0)
      high = mid - 1;
    // BAM. how often does this really happen?
    else
      return mid;
  }
  return -(low + 1);
}
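// Hedged usage sketch of the byte[][] overload above. The index array and
// search key are made up for illustration. A non-negative result is an exact
// hit; a negative result r encodes the insertion point as -(r + 1), so the
// block whose first key is the greatest key <= the search key sits one slot
// below that insertion point.
byte[][] blockFirstKeys = {
    Bytes.toBytes("apple"), Bytes.toBytes("mango"), Bytes.toBytes("zebra") };
byte[] searchKey = Bytes.toBytes("melon");
int pos = binarySearch(blockFirstKeys, searchKey, 0, searchKey.length,
    Bytes.BYTES_RAWCOMPARATOR);
int blockIdx = (pos >= 0) ? pos : -(pos + 1) - 1;  // block containing searchKey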
private void combineAndSpill(
    RawKeyValueIterator kvIter,
    Counters.Counter inCounter) throws IOException {
  JobConf job = jobConf;
  Reducer combiner = ReflectionUtils.newInstance(combinerClass, job);
  Class<K> keyClass = (Class<K>) job.getMapOutputKeyClass();
  Class<V> valClass = (Class<V>) job.getMapOutputValueClass();
  RawComparator<K> comparator =
      (RawComparator<K>) job.getCombinerKeyGroupingComparator();
  try {
    CombineValuesIterator values =
        new CombineValuesIterator(kvIter, comparator, keyClass, valClass,
                                  job, Reporter.NULL, inCounter);
    while (values.more()) {
      combiner.reduce(values.getKey(), values, combineCollector,
                      Reporter.NULL);
      values.nextKey();
    }
  } finally {
    combiner.close();
  }
}
@SuppressWarnings("unchecked") static BytesComparator makeComparator(String comparator) { if (comparator.length() == 0) { // unsorted keys return null; } if (comparator.equals(COMPARATOR_MEMCMP)) { // default comparator return new BytesComparator(new MemcmpRawComparator()); } else if (comparator.startsWith(COMPARATOR_JCLASS)) { String compClassName = comparator.substring(COMPARATOR_JCLASS.length()).trim(); try { Class compClass = Class.forName(compClassName); // use its default ctor to create an instance return new BytesComparator((RawComparator<Object>) compClass .newInstance()); } catch (Exception e) { throw new IllegalArgumentException( "Failed to instantiate comparator: " + comparator + "(" + e.toString() + ")"); } } else { throw new IllegalArgumentException("Unsupported comparator: " + comparator); } }
@SuppressWarnings("unchecked") public RawKeyValueIterator finish() throws Throwable { // merge config params Class<K> keyClass = (Class<K>) jobConf.getMapOutputKeyClass(); Class<V> valueClass = (Class<V>) jobConf.getMapOutputValueClass(); final RawComparator<K> comparator = (RawComparator<K>) jobConf.getOutputKeyComparator(); // Wait for on-going merges to complete merger.close(); LOG.info("finalMerge called with " + segmentsToBeMerged.size() + " on-disk map-outputs"); List<Segment<K, V>> segments = new ArrayList<Segment<K, V>>(); long onDiskBytes = 0; for (Segment<K, V> segment : segmentsToBeMerged) { long fileLength = segment.getLength(); onDiskBytes += fileLength; LOG.debug("Disk file: " + segment + " Length is " + fileLength); segments.add(segment); } segmentsToBeMerged.clear(); LOG.info("Merging " + segmentsToBeMerged.size() + " files, " + onDiskBytes + " bytes from disk"); Collections.sort(segments, new Comparator<Segment<K, V>>() { public int compare(Segment<K, V> o1, Segment<K, V> o2) { if (o1.getLength() == o2.getLength()) { return 0; } return o1.getLength() < o2.getLength() ? -1 : 1; } }); return Merger.merge(jobConf, lustrefs, keyClass, valueClass, segments, segments.size(), mergeTempDir, comparator, reporter, spilledRecordsCounter, null, null); }
public void testTotalOrderCustomComparator() throws Exception {
  TotalOrderPartitioner<Text,NullWritable> partitioner =
      new TotalOrderPartitioner<Text,NullWritable>();
  Configuration conf = new Configuration();
  Text[] revSplitStrings = Arrays.copyOf(splitStrings, splitStrings.length);
  Arrays.sort(revSplitStrings, new ReverseStringComparator());
  Path p = TestTotalOrderPartitioner.<Text>writePartitionFile(
      "totalordercustomcomparator", conf, revSplitStrings);
  conf.setBoolean(TotalOrderPartitioner.NATURAL_ORDER, false);
  conf.setClass(MRJobConfig.MAP_OUTPUT_KEY_CLASS, Text.class, Object.class);
  conf.setClass(MRJobConfig.KEY_COMPARATOR,
      ReverseStringComparator.class, RawComparator.class);
  ArrayList<Check<Text>> revCheck = new ArrayList<Check<Text>>();
  revCheck.add(new Check<Text>(new Text("aaaaa"), 9));
  revCheck.add(new Check<Text>(new Text("aaabb"), 9));
  revCheck.add(new Check<Text>(new Text("aabbb"), 9));
  revCheck.add(new Check<Text>(new Text("aaaaa"), 9));
  revCheck.add(new Check<Text>(new Text("babbb"), 8));
  revCheck.add(new Check<Text>(new Text("baabb"), 8));
  revCheck.add(new Check<Text>(new Text("yai"), 1));
  revCheck.add(new Check<Text>(new Text("yak"), 1));
  revCheck.add(new Check<Text>(new Text("z"), 0));
  revCheck.add(new Check<Text>(new Text("ddngo"), 4));
  revCheck.add(new Check<Text>(new Text("hi"), 3));
  try {
    partitioner.setConf(conf);
    NullWritable nw = NullWritable.get();
    for (Check<Text> chk : revCheck) {
      assertEquals(chk.data.toString(), chk.part,
          partitioner.getPartition(chk.data, nw, splitStrings.length + 1));
    }
  } finally {
    p.getFileSystem(conf).delete(p, true);
  }
}
public CombineValuesIterator(RawKeyValueIterator in,
    RawComparator<KEY> comparator, Class<KEY> keyClass,
    Class<VALUE> valClass, Configuration conf, Reporter reporter,
    Counters.Counter combineInputCounter) throws IOException {
  super(in, comparator, keyClass, valClass, conf, reporter);
  this.combineInputCounter = combineInputCounter;
}
@SuppressWarnings("unchecked") protected static <INKEY,INVALUE,OUTKEY,OUTVALUE> org.apache.hadoop.mapreduce.Reducer<INKEY,INVALUE,OUTKEY,OUTVALUE>.Context createReduceContext(org.apache.hadoop.mapreduce.Reducer <INKEY,INVALUE,OUTKEY,OUTVALUE> reducer, Configuration job, org.apache.hadoop.mapreduce.TaskAttemptID taskId, RawKeyValueIterator rIter, org.apache.hadoop.mapreduce.Counter inputKeyCounter, org.apache.hadoop.mapreduce.Counter inputValueCounter, org.apache.hadoop.mapreduce.RecordWriter<OUTKEY,OUTVALUE> output, org.apache.hadoop.mapreduce.OutputCommitter committer, org.apache.hadoop.mapreduce.StatusReporter reporter, RawComparator<INKEY> comparator, Class<INKEY> keyClass, Class<INVALUE> valueClass ) throws IOException, InterruptedException { org.apache.hadoop.mapreduce.ReduceContext<INKEY, INVALUE, OUTKEY, OUTVALUE> reduceContext = new ReduceContextImpl<INKEY, INVALUE, OUTKEY, OUTVALUE>(job, taskId, rIter, inputKeyCounter, inputValueCounter, output, committer, reporter, comparator, keyClass, valueClass); org.apache.hadoop.mapreduce.Reducer<INKEY,INVALUE,OUTKEY,OUTVALUE>.Context reducerContext = new WrappedReducer<INKEY, INVALUE, OUTKEY, OUTVALUE>().getReducerContext( reduceContext); return reducerContext; }
public MergeQueue(Configuration conf, FileSystem fs,
                  Path[] inputs, boolean deleteInputs,
                  CompressionCodec codec, RawComparator<K> comparator,
                  Progressable reporter,
                  Counters.Counter mergedMapOutputsCounter,
                  TaskType taskType)
    throws IOException {
  this.conf = conf;
  this.fs = fs;
  this.codec = codec;
  this.comparator = comparator;
  this.reporter = reporter;

  if (taskType == TaskType.MAP) {
    considerFinalMergeForProgress();
  }

  for (Path file : inputs) {
    LOG.debug("MergeQ: adding: " + file);
    segments.add(new Segment<K, V>(conf, fs, file, codec, !deleteInputs,
                                   (file.toString().endsWith(
                                       Task.MERGED_OUTPUT_PREFIX) ?
                                    null : mergedMapOutputsCounter)));
  }

  // Sort segments on file-lengths
  Collections.sort(segments, segmentComparator);
}

public MergeQueue(Configuration conf, FileSystem fs,
    List<Segment<K, V>> segments, RawComparator<K> comparator,
    Progressable reporter, boolean sortSegments, TaskType taskType) {
  this.conf = conf;
  this.fs = fs;
  this.comparator = comparator;
  this.segments = segments;
  this.reporter = reporter;
  if (taskType == TaskType.MAP) {
    considerFinalMergeForProgress();
  }
  if (sortSegments) {
    Collections.sort(segments, segmentComparator);
  }
}

public MergeQueue(Configuration conf, FileSystem fs,
    List<Segment<K, V>> segments, RawComparator<K> comparator,
    Progressable reporter, boolean sortSegments, CompressionCodec codec,
    TaskType taskType) {
  this(conf, fs, segments, comparator, reporter, sortSegments, taskType);
  this.codec = codec;
}
public ReduceValuesIterator(RawKeyValueIterator in,
                            RawComparator<KEY> comparator,
                            Class<KEY> keyClass, Class<VALUE> valClass,
                            Configuration conf, Progressable reporter)
    throws IOException {
  super(in, comparator, keyClass, valClass, conf, reporter);
}
/**
 * Get the {@link RawComparator} comparator used to compare keys.
 *
 * @return the {@link RawComparator} comparator used to compare keys.
 */
public RawComparator getOutputKeyComparator() {
  Class<? extends RawComparator> theClass = getClass(
      JobContext.KEY_COMPARATOR, null, RawComparator.class);
  if (theClass != null)
    return ReflectionUtils.newInstance(theClass, this);
  return WritableComparator.get(
      getMapOutputKeyClass().asSubclass(WritableComparable.class), this);
}
/**
 * Get the user defined {@link WritableComparable} comparator for
 * grouping keys of inputs to the combiner.
 *
 * @return comparator set by the user for grouping values.
 * @see #setCombinerKeyGroupingComparator(Class) for details.
 */
public RawComparator getCombinerKeyGroupingComparator() {
  Class<? extends RawComparator> theClass = getClass(
      JobContext.COMBINER_GROUP_COMPARATOR_CLASS, null, RawComparator.class);
  if (theClass == null) {
    return getOutputKeyComparator();
  }
  return ReflectionUtils.newInstance(theClass, this);
}
/**
 * Get the user defined {@link WritableComparable} comparator for
 * grouping keys of inputs to the reduce.
 *
 * @return comparator set by the user for grouping values.
 * @see #setOutputValueGroupingComparator(Class) for details.
 */
public RawComparator getOutputValueGroupingComparator() {
  Class<? extends RawComparator> theClass = getClass(
      JobContext.GROUP_COMPARATOR_CLASS, null, RawComparator.class);
  if (theClass == null) {
    return getOutputKeyComparator();
  }
  return ReflectionUtils.newInstance(theClass, this);
}
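// Hedged sketch of the user-facing side of the three getters above: a custom
// raw comparator registered on a JobConf. ReverseTextRawComparator is a
// made-up example class (it sorts serialized Text keys in descending byte
// order); the setters shown in the trailing comment are the standard JobConf
// methods whose values JobContext.KEY_COMPARATOR, GROUP_COMPARATOR_CLASS and
// COMBINER_GROUP_COMPARATOR_CLASS read back.
public static class ReverseTextRawComparator extends WritableComparator {
  public ReverseTextRawComparator() {
    super(Text.class);
  }
  @Override
  public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
    // Skip the vint length prefix of each serialized Text, then invert the
    // byte-wise comparison to get a descending sort order.
    int n1 = WritableUtils.decodeVIntSize(b1[s1]);
    int n2 = WritableUtils.decodeVIntSize(b2[s2]);
    return -compareBytes(b1, s1 + n1, l1 - n1, b2, s2 + n2, l2 - n2);
  }
}

// Typical wiring on a JobConf:
//   JobConf job = new JobConf();
//   job.setOutputKeyComparatorClass(ReverseTextRawComparator.class);
//   job.setOutputValueGroupingComparator(ReverseTextRawComparator.class);
//   job.setCombinerKeyGroupingComparator(ReverseTextRawComparator.class);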
/**
 * Read in the partition file and build indexing data structures.
 * If the keytype is {@link org.apache.hadoop.io.BinaryComparable} and
 * <tt>total.order.partitioner.natural.order</tt> is not false, a trie
 * of the first <tt>total.order.partitioner.max.trie.depth</tt>(2) + 1 bytes
 * will be built. Otherwise, keys will be located using a binary search of
 * the partition keyset using the {@link org.apache.hadoop.io.RawComparator}
 * defined for this job. The input file must be sorted with the same
 * comparator and contain {@link Job#getNumReduceTasks()} - 1 keys.
 */
@SuppressWarnings("unchecked") // keytype from conf not static
public void setConf(Configuration conf) {
  try {
    this.conf = conf;
    String parts = getPartitionFile(conf);
    final Path partFile = new Path(parts);
    final FileSystem fs = (DEFAULT_PATH.equals(parts))
        ? FileSystem.getLocal(conf)     // assume in DistributedCache
        : partFile.getFileSystem(conf);

    Job job = Job.getInstance(conf);
    Class<K> keyClass = (Class<K>) job.getMapOutputKeyClass();
    K[] splitPoints = readPartitions(fs, partFile, keyClass, conf);
    if (splitPoints.length != job.getNumReduceTasks() - 1) {
      throw new IOException("Wrong number of partitions in keyset");
    }
    RawComparator<K> comparator =
        (RawComparator<K>) job.getSortComparator();
    for (int i = 0; i < splitPoints.length - 1; ++i) {
      if (comparator.compare(splitPoints[i], splitPoints[i+1]) >= 0) {
        throw new IOException("Split points are out of order");
      }
    }
    boolean natOrder = conf.getBoolean(NATURAL_ORDER, true);
    if (natOrder && BinaryComparable.class.isAssignableFrom(keyClass)) {
      partitions = buildTrie((BinaryComparable[]) splitPoints, 0,
          splitPoints.length, new byte[0],
          // Now that blocks of identical splitless trie nodes are
          // represented reentrantly, and we develop a leaf for any trie
          // node with only one split point, the only reason for a depth
          // limit is to refute stack overflow or bloat in the pathological
          // case where the split points are long and mostly look like bytes
          // iii...iixii...iii . Therefore, we make the default depth
          // limit large but not huge.
          conf.getInt(MAX_TRIE_DEPTH, 200));
    } else {
      partitions = new BinarySearchNode(splitPoints, comparator);
    }
  } catch (IOException e) {
    throw new IllegalArgumentException("Can't read partitions file", e);
  }
}
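// Hedged job-setup sketch for the partitioner above; the path, key/value types
// and sampling parameters are illustrative. InputSampler writes the sorted
// split points that setConf() later reads back, and the job's sort comparator
// must match the order of that partition file.
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "total-order-sort");
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setNumReduceTasks(10);
job.setPartitionerClass(TotalOrderPartitioner.class);

Path partitionFile = new Path("/tmp/partitions.lst");   // illustrative path
TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFile);
InputSampler.writePartitionFile(job,
    new InputSampler.RandomSampler<Text, Text>(0.01, 1000, 10));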