@Override public void init() throws IOException { registerKey(NullWritable.class.getName(), NullWritableSerializer.class); registerKey(Text.class.getName(), TextSerializer.class); registerKey(LongWritable.class.getName(), LongWritableSerializer.class); registerKey(IntWritable.class.getName(), IntWritableSerializer.class); registerKey(Writable.class.getName(), DefaultSerializer.class); registerKey(BytesWritable.class.getName(), BytesWritableSerializer.class); registerKey(BooleanWritable.class.getName(), BoolWritableSerializer.class); registerKey(ByteWritable.class.getName(), ByteWritableSerializer.class); registerKey(FloatWritable.class.getName(), FloatWritableSerializer.class); registerKey(DoubleWritable.class.getName(), DoubleWritableSerializer.class); registerKey(VIntWritable.class.getName(), VIntWritableSerializer.class); registerKey(VLongWritable.class.getName(), VLongWritableSerializer.class); LOG.info("Hadoop platform inited"); }
@Override public void reduce(ByteWritable key, Iterator<Text> values, OutputCollector<Text, ByteWritable> output, Reporter reporter) throws IOException { while (values.hasNext()) { Text document = values.next(); writers.delete(document.toString()); totalDeleted++; reporter.incrCounter("CleaningJobStatus", "Deleted documents", 1); // if (numDeletes >= NUM_MAX_DELETE_REQUEST) { // LOG.info("CleaningJob: deleting " + numDeletes // + " documents"); // // TODO updateRequest.process(solr); // // TODO updateRequest = new UpdateRequest(); // writers.delete(key.toString()); // totalDeleted += numDeletes; // numDeletes = 0; // } } }
@Test public void testWriteByte() throws Exception { if (!canTest()) { return; } byte aByte = 8; template.sendBody("direct:write_byte", aByte); Configuration conf = new Configuration(); Path file1 = new Path("file:///" + TEMP_DIR.toUri() + "/test-camel-byte"); SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(file1)); Writable key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf); Writable value = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf); reader.next(key, value); byte rByte = ((ByteWritable) value).get(); assertEquals(rByte, aByte); IOHelper.close(reader); }
@Test public void testWriteByte() throws Exception { if (!canTest()) { return; } byte aByte = 8; template.sendBody("direct:write_byte", aByte); Configuration conf = new Configuration(); Path file1 = new Path("file:///" + TEMP_DIR.toUri() + "/test-camel-byte"); FileSystem fs1 = FileSystem.get(file1.toUri(), conf); SequenceFile.Reader reader = new SequenceFile.Reader(fs1, file1, conf); Writable key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf); Writable value = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf); reader.next(key, value); byte rByte = ((ByteWritable) value).get(); assertEquals(rByte, aByte); IOHelper.close(reader); }
@Override @SuppressWarnings("unchecked") public void reduce(ByteWritable key, Iterator<OffsetCount> values, OutputCollector<ByteWritable, OffsetCount> out, Reporter report) throws IOException { //need to sort the values by filename and fileoffset while(values.hasNext()) list.add(new OffsetCount(values.next())); Collections.sort(list); long lineOffset=0; for(OffsetCount oc: list) { long count=oc.count; oc.count=lineOffset; out.collect(key, oc); lineOffset+=count; } report.incrCounter(CSVReblockMR.NUM_ROWS_IN_MATRIX, key.toString(), lineOffset); list.clear(); }
public void delete(String crawldb, String solrUrl, boolean noCommit) throws IOException { SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); long start = System.currentTimeMillis(); LOG.info("SolrClean: starting at " + sdf.format(start)); JobConf job = new NutchJob(getConf()); FileInputFormat.addInputPath(job, new Path(crawldb, CrawlDb.CURRENT_NAME)); job.setBoolean("noCommit", noCommit); job.set(SolrConstants.SERVER_URL, solrUrl); job.setInputFormat(SequenceFileInputFormat.class); job.setOutputFormat(NullOutputFormat.class); job.setMapOutputKeyClass(ByteWritable.class); job.setMapOutputValueClass(Text.class); job.setMapperClass(DBFilter.class); job.setReducerClass(SolrDeleter.class); JobClient.runJob(job); long end = System.currentTimeMillis(); LOG.info("SolrClean: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end)); }
/** * Returns the corresponding Writable object for this column type. */ public Writable getWritableInstance(com.cloudera.recordservice.core.Schema.Type type) { switch (type) { case BOOLEAN: return new BooleanWritable(); case TINYINT: return new ByteWritable(); case SMALLINT: return new ShortWritable(); case INT: return new IntWritable(); case BIGINT: return new LongWritable(); case FLOAT: return new FloatWritable(); case DOUBLE: return new DoubleWritable(); case VARCHAR: case CHAR: case STRING: return new Text(); case TIMESTAMP_NANOS: return new TimestampNanosWritable(); case DECIMAL: return new DecimalWritable(); default: throw new UnsupportedOperationException( "Unexpected type: " + toString()); } }
private static final WritableComparable<?> createWritable(DataType type) { switch (type) { case BOOLEAN: return new BooleanWritable(); case BYTE: return new ByteWritable(); case INT: return new IntWritable(); case LONG: return new LongWritable(); case FLOAT: return new FloatWritable(); case DOUBLE: return new DoubleWritable(); case STRING: return new Text(); default: return null; } }
protected void reduce(ByteWritable key, Iterable<RowNumberWritable> values, Context context) throws IOException, InterruptedException { Iterator<RowNumberWritable> itr = values.iterator(); if (!itr.hasNext()) { return; } long offset = 0; RowNumberWritable value = itr.next(); while (itr.hasNext() && value.getCount() > 0) { offset += value.getCount(); value = itr.next(); } outputKey.set(Long.toString(offset++)); context.write(outputKey, value.getValue()); while(itr.hasNext()) { value = itr.next(); outputKey.set(Long.toString(offset++)); context.write(outputKey, value.getValue()); } }
protected Object translateWritableToPigDataType(Writable w, byte dataType) { switch(dataType) { case DataType.CHARARRAY: return ((Text) w).toString(); case DataType.BYTEARRAY: BytesWritable bw = (BytesWritable) w; // Make a copy return new DataByteArray(bw.getBytes(), 0, bw.getLength()); case DataType.BOOLEAN: return ((BooleanWritable) w).get(); case DataType.INTEGER: return ((IntWritable) w).get(); case DataType.LONG: return ((LongWritable) w).get(); case DataType.FLOAT: return ((FloatWritable) w).get(); case DataType.DOUBLE: return ((DoubleWritable) w).get(); case DataType.BYTE: return ((ByteWritable) w).get(); case DataType.DATETIME: return ((DateTimeWritable) w).get(); } return null; }
@SuppressWarnings("unchecked") private<T> T convert(Record flinkType, int pos, Class<T> hadoopType) { if(hadoopType == LongWritable.class ) { return (T) new LongWritable((flinkType.getField(pos, LongValue.class)).getValue()); } if(hadoopType == org.apache.hadoop.io.Text.class) { return (T) new Text((flinkType.getField(pos, StringValue.class)).getValue()); } if(hadoopType == org.apache.hadoop.io.IntWritable.class) { return (T) new IntWritable((flinkType.getField(pos, IntValue.class)).getValue()); } if(hadoopType == org.apache.hadoop.io.FloatWritable.class) { return (T) new FloatWritable((flinkType.getField(pos, FloatValue.class)).getValue()); } if(hadoopType == org.apache.hadoop.io.DoubleWritable.class) { return (T) new DoubleWritable((flinkType.getField(pos, DoubleValue.class)).getValue()); } if(hadoopType == org.apache.hadoop.io.BooleanWritable.class) { return (T) new BooleanWritable((flinkType.getField(pos, BooleanValue.class)).getValue()); } if(hadoopType == org.apache.hadoop.io.ByteWritable.class) { return (T) new ByteWritable((flinkType.getField(pos, ByteValue.class)).getValue()); } throw new RuntimeException("Unable to convert Flink type ("+flinkType.getClass().getCanonicalName()+") to Hadoop."); }
public void write(Writable w) throws IOException { if (w instanceof TypedBytesWritable) { writeTypedBytes((TypedBytesWritable) w); } else if (w instanceof BytesWritable) { writeBytes((BytesWritable) w); } else if (w instanceof ByteWritable) { writeByte((ByteWritable) w); } else if (w instanceof BooleanWritable) { writeBoolean((BooleanWritable) w); } else if (w instanceof IntWritable) { writeInt((IntWritable) w); } else if (w instanceof VIntWritable) { writeVInt((VIntWritable) w); } else if (w instanceof LongWritable) { writeLong((LongWritable) w); } else if (w instanceof VLongWritable) { writeVLong((VLongWritable) w); } else if (w instanceof FloatWritable) { writeFloat((FloatWritable) w); } else if (w instanceof DoubleWritable) { writeDouble((DoubleWritable) w); } else if (w instanceof Text) { writeText((Text) w); } else if (w instanceof ArrayWritable) { writeArray((ArrayWritable) w); } else if (w instanceof MapWritable) { writeMap((MapWritable) w); } else if (w instanceof SortedMapWritable) { writeSortedMap((SortedMapWritable) w); } else if (w instanceof Record) { writeRecord((Record) w); } else { writeWritable(w); // last resort } }
public Class<? extends Writable> readType() throws IOException { Type type = in.readType(); if (type == null) { return null; } switch (type) { case BYTES: return BytesWritable.class; case BYTE: return ByteWritable.class; case BOOL: return BooleanWritable.class; case INT: return VIntWritable.class; case LONG: return VLongWritable.class; case FLOAT: return FloatWritable.class; case DOUBLE: return DoubleWritable.class; case STRING: return Text.class; case VECTOR: return ArrayWritable.class; case MAP: return MapWritable.class; case WRITABLE: return Writable.class; default: throw new RuntimeException("unknown type"); } }
public static void updateObject(Writable obj, byte[] seed) { if (obj instanceof IntWritable) { ((IntWritable)obj).set(Ints.fromByteArray(seed)); } else if (obj instanceof FloatWritable) { ((FloatWritable)obj).set(r.nextFloat()); } else if (obj instanceof DoubleWritable) { ((DoubleWritable)obj).set(r.nextDouble()); } else if (obj instanceof LongWritable) { ((LongWritable)obj).set(Longs.fromByteArray(seed)); } else if (obj instanceof VIntWritable) { ((VIntWritable)obj).set(Ints.fromByteArray(seed)); } else if (obj instanceof VLongWritable) { ((VLongWritable)obj).set(Longs.fromByteArray(seed)); } else if (obj instanceof BooleanWritable) { ((BooleanWritable)obj).set(seed[0] % 2 == 1 ? true : false); } else if (obj instanceof Text) { ((Text)obj).set(BytesUtil.toStringBinary(seed)); } else if (obj instanceof ByteWritable) { ((ByteWritable)obj).set(seed.length > 0 ? seed[0] : 0); } else if (obj instanceof BytesWritable) { ((BytesWritable)obj).set(seed, 0, seed.length); } else if (obj instanceof UTF8) { ((UTF8)obj).set(BytesUtil.toStringBinary(seed)); } else if (obj instanceof MockValueClass) { ((MockValueClass)obj).set(seed); } else { throw new IllegalArgumentException("unknown writable: " + obj.getClass().getName()); } }
public static <VTYPE> byte[] toBytes(VTYPE obj) { final String className = obj.getClass().getName(); if (className.equals(IntWritable.class.getName())) { return Ints.toByteArray(((IntWritable) obj).get()); } else if (className.equals(FloatWritable.class.getName())) { return BytesUtil.toBytes(((FloatWritable) obj).get()); } else if (className.equals(DoubleWritable.class.getName())) { return BytesUtil.toBytes(((DoubleWritable) obj).get()); } else if (className.equals(LongWritable.class.getName())) { return Longs.toByteArray(((LongWritable) obj).get()); } else if (className.equals(VIntWritable.class.getName())) { return Ints.toByteArray(((VIntWritable) obj).get()); } else if (className.equals(VLongWritable.class.getName())) { return Longs.toByteArray(((VLongWritable) obj).get()); } else if (className.equals(BooleanWritable.class.getName())) { return BytesUtil.toBytes(((BooleanWritable) obj).get()); } else if (className.equals(Text.class.getName())) { return ((Text)obj).copyBytes(); } else if (className.equals(ByteWritable.class.getName())) { return Ints.toByteArray((int) ((ByteWritable) obj).get()); } else if (className.equals(BytesWritable.class.getName())) { // TODO: copyBytes instead? return ((BytesWritable) obj).getBytes(); } else { return new byte[0]; } }
public void write(Writable w) throws IOException { if (w instanceof TypedBytesWritable) { writeTypedBytes((TypedBytesWritable) w); } else if (w instanceof BytesWritable) { writeBytes((BytesWritable) w); } else if (w instanceof ByteWritable) { writeByte((ByteWritable) w); } else if (w instanceof BooleanWritable) { writeBoolean((BooleanWritable) w); } else if (w instanceof IntWritable) { writeInt((IntWritable) w); } else if (w instanceof VIntWritable) { writeVInt((VIntWritable) w); } else if (w instanceof LongWritable) { writeLong((LongWritable) w); } else if (w instanceof VLongWritable) { writeVLong((VLongWritable) w); } else if (w instanceof FloatWritable) { writeFloat((FloatWritable) w); } else if (w instanceof DoubleWritable) { writeDouble((DoubleWritable) w); } else if (w instanceof Text) { writeText((Text) w); } else if (w instanceof ArrayWritable) { writeArray((ArrayWritable) w); } else if (w instanceof MapWritable) { writeMap((MapWritable) w); } else if (w instanceof SortedMapWritable) { writeSortedMap((SortedMapWritable<?>) w); } else if (w instanceof Record) { writeRecord((Record) w); } else { writeWritable(w); // last resort } }
@Override public void map(Text key, CrawlDatum value, OutputCollector<ByteWritable, Text> output, Reporter reporter) throws IOException { if (value.getStatus() == CrawlDatum.STATUS_DB_GONE || value.getStatus() == CrawlDatum.STATUS_DB_DUPLICATE) { output.collect(OUT, key); } }
public void delete(String crawldb, boolean noCommit) throws IOException { SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); long start = System.currentTimeMillis(); LOG.info("CleaningJob: starting at " + sdf.format(start)); JobConf job = new NutchJob(getConf()); FileInputFormat.addInputPath(job, new Path(crawldb, CrawlDb.CURRENT_NAME)); job.setBoolean("noCommit", noCommit); job.setInputFormat(SequenceFileInputFormat.class); job.setOutputFormat(NullOutputFormat.class); job.setMapOutputKeyClass(ByteWritable.class); job.setMapOutputValueClass(Text.class); job.setMapperClass(DBFilter.class); job.setReducerClass(DeleterReducer.class); job.setJobName("CleaningJob"); // need to expicitely allow deletions job.setBoolean(IndexerMapReduce.INDEXER_DELETE, true); JobClient.runJob(job); long end = System.currentTimeMillis(); LOG.info("CleaningJob: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end)); }
public static Writable toWritable(Object object) { if (object == null) { return null; //return NullWritable.get(); } if (object instanceof Writable) { return (Writable) object; } if (object instanceof String) { return new Text((String) object); } if (object instanceof Long) { return new VLongWritable((Long) object); } if (object instanceof Integer) { return new VIntWritable((Integer) object); } if (object instanceof Byte) { return new ByteWritable((Byte) object); } if (object instanceof Double) { return new DoubleWritable((Double) object); } if (object instanceof Float) { return new FloatWritable((Float) object); } if (object instanceof Boolean) { return new BooleanWritable((Boolean) object); } if (object instanceof byte[]) { return new BytesWritable((byte[]) object); } return new BytesWritable(object.toString().getBytes()); }
@Override public Writable create(Object value, TypeConverter typeConverter, Holder<Integer> size) { size.value = SIZE; ByteWritable writable = new ByteWritable(); writable.set(typeConverter.convertTo(Byte.class, value)); return writable; }
@Test public void testReadByte() throws Exception { if (!canTest()) { return; } final Path file = new Path(new File("target/test/test-camel-byte").getAbsolutePath()); Configuration conf = new Configuration(); SequenceFile.Writer writer = createWriter(conf, file, NullWritable.class, ByteWritable.class); NullWritable keyWritable = NullWritable.get(); ByteWritable valueWritable = new ByteWritable(); byte value = 3; valueWritable.set(value); writer.append(keyWritable, valueWritable); writer.sync(); writer.close(); MockEndpoint resultEndpoint = context.getEndpoint("mock:result", MockEndpoint.class); resultEndpoint.expectedMessageCount(1); resultEndpoint.message(0).body(byte.class).isEqualTo(3); context.addRoutes(new RouteBuilder() { public void configure() { from("hdfs2:localhost/" + file.toUri() + "?fileSystemType=LOCAL&fileType=SEQUENCE_FILE&initialDelay=0").to("mock:result"); } }); context.start(); resultEndpoint.assertIsSatisfied(); }
@Test public void testReadByte() throws Exception { if (!canTest()) { return; } final Path file = new Path(new File("target/test/test-camel-byte").getAbsolutePath()); Configuration conf = new Configuration(); FileSystem fs1 = FileSystem.get(file.toUri(), conf); SequenceFile.Writer writer = createWriter(fs1, conf, file, NullWritable.class, ByteWritable.class); NullWritable keyWritable = NullWritable.get(); ByteWritable valueWritable = new ByteWritable(); byte value = 3; valueWritable.set(value); writer.append(keyWritable, valueWritable); writer.sync(); writer.close(); MockEndpoint resultEndpoint = context.getEndpoint("mock:result", MockEndpoint.class); resultEndpoint.expectedMessageCount(1); resultEndpoint.message(0).body(byte.class).isEqualTo(3); context.addRoutes(new RouteBuilder() { public void configure() { from("hdfs:localhost/" + file.toUri() + "?fileSystemType=LOCAL&fileType=SEQUENCE_FILE&initialDelay=0").to("mock:result"); } }); context.start(); resultEndpoint.assertIsSatisfied(); }
@Override public void map(LongWritable key, Text value, OutputCollector<ByteWritable, OffsetCount> out, Reporter report) throws IOException { if(first) { first = false; fileOffset = key.get(); outCache = out; } //getting the number of colums if(key.get()==0 && headerFile) { if(!ignoreFirstLine) { report.incrCounter(CSVReblockMR.NUM_COLS_IN_MATRIX, outKey.toString(), value.toString().split(delim, -1).length); num++; } else realFirstLine = true; } else { if(realFirstLine) { report.incrCounter(CSVReblockMR.NUM_COLS_IN_MATRIX, outKey.toString(), value.toString().split(delim, -1).length); realFirstLine = false; } num++; } }
@Override public void map(Text key, CrawlDatum value, OutputCollector<ByteWritable, Text> output, Reporter reporter) throws IOException { if (value.getStatus() == CrawlDatum.STATUS_DB_GONE) { output.collect(OUT, key); } }