/**
 * This should be primarily used for testing.
 * @return a clone of the replica stored in datanode memory
 */
ReplicaInfo fetchReplicaInfo(String bpid, long blockId) {
  ReplicaInfo r = volumeMap.get(bpid, blockId);
  if(r == null)
    return null;
  switch(r.getState()) {
  case FINALIZED:
    return new FinalizedReplica((FinalizedReplica)r);
  case RBW:
    return new ReplicaBeingWritten((ReplicaBeingWritten)r);
  case RWR:
    return new ReplicaWaitingToBeRecovered((ReplicaWaitingToBeRecovered)r);
  case RUR:
    return new ReplicaUnderRecovery((ReplicaUnderRecovery)r);
  case TEMPORARY:
    return new ReplicaInPipeline((ReplicaInPipeline)r);
  }
  return null;
}
/**
 * Returns a clone of a replica stored in data-node memory.
 * Should be primarily used for testing.
 * @param bpid block pool id
 * @param blockId block id
 * @return a clone of the in-memory replica, or null if none exists
 */
ReplicaInfo fetchReplicaInfo(String bpid, long blockId) {
  ReplicaInfo r = volumeMap.get(bpid, blockId);
  if(r == null)
    return null;
  switch(r.getState()) {
  case FINALIZED:
    return new FinalizedReplica((FinalizedReplica)r);
  case RBW:
    return new ReplicaBeingWritten((ReplicaBeingWritten)r);
  case RWR:
    return new ReplicaWaitingToBeRecovered((ReplicaWaitingToBeRecovered)r);
  case RUR:
    return new ReplicaUnderRecovery((ReplicaUnderRecovery)r);
  case TEMPORARY:
    return new ReplicaInPipeline((ReplicaInPipeline)r);
  }
  return null;
}
@Override // FsDatasetSpi
public synchronized ReplicaInPipeline recoverAppend(ExtendedBlock b, long newGS,
    long expectedBlockLen) throws IOException {
  LOG.info("Recover failed append to " + b);

  ReplicaInfo replicaInfo = recoverCheck(b, newGS, expectedBlockLen);

  // change the replica's state/gs etc.
  if (replicaInfo.getState() == ReplicaState.FINALIZED) {
    return append(b.getBlockPoolId(), (FinalizedReplica) replicaInfo,
        newGS, b.getNumBytes());
  } else { //RBW
    bumpReplicaGS(replicaInfo, newGS);
    return (ReplicaBeingWritten) replicaInfo;
  }
}
@Override // FsDatasetSpi
public synchronized ReplicaInPipeline createRbw(ExtendedBlock b)
    throws IOException {
  ReplicaInfo replicaInfo = volumeMap.get(b.getBlockPoolId(),
      b.getBlockId());
  if (replicaInfo != null) {
    throw new ReplicaAlreadyExistsException("Block " + b +
        " already exists in state " + replicaInfo.getState() +
        " and thus cannot be created.");
  }
  // create a new block
  FsVolumeImpl v = volumes.getNextVolume(b.getNumBytes());
  // create a rbw file to hold block in the designated volume
  File f = v.createRbwFile(b.getBlockPoolId(), b.getLocalBlock());
  ReplicaBeingWritten newReplicaInfo = new ReplicaBeingWritten(b.getBlockId(),
      b.getGenerationStamp(), v, f.getParentFile());
  volumeMap.add(b.getBlockPoolId(), newReplicaInfo);
  return newReplicaInfo;
}
@Override // FsDatasetSpi
public synchronized ReplicaInPipeline createTemporary(ExtendedBlock b)
    throws IOException {
  ReplicaInfo replicaInfo = volumeMap.get(b.getBlockPoolId(),
      b.getBlockId());
  if (replicaInfo != null) {
    throw new ReplicaAlreadyExistsException("Block " + b +
        " already exists in state " + replicaInfo.getState() +
        " and thus cannot be created.");
  }

  FsVolumeImpl v = volumes.getNextVolume(b.getNumBytes());
  // create a temporary file to hold block in the designated volume
  File f = v.createTmpFile(b.getBlockPoolId(), b.getLocalBlock());
  ReplicaInPipeline newReplicaInfo = new ReplicaInPipeline(b.getBlockId(),
      b.getGenerationStamp(), v, f.getParentFile());
  volumeMap.add(b.getBlockPoolId(), newReplicaInfo);
  return newReplicaInfo;
}
@Override // FsDatasetSpi
public synchronized ReplicaInPipeline createTemporary(StorageType storageType,
    ExtendedBlock b) throws IOException {
  ReplicaInfo replicaInfo = volumeMap.get(b.getBlockPoolId(), b.getBlockId());
  if (replicaInfo != null) {
    if (replicaInfo.getGenerationStamp() < b.getGenerationStamp()
        && replicaInfo instanceof ReplicaInPipeline) {
      // Stop the previous writer
      ((ReplicaInPipeline)replicaInfo)
          .stopWriter(datanode.getDnConf().getXceiverStopTimeout());
      invalidate(b.getBlockPoolId(), new Block[]{replicaInfo});
    } else {
      throw new ReplicaAlreadyExistsException("Block " + b +
          " already exists in state " + replicaInfo.getState() +
          " and thus cannot be created.");
    }
  }

  FsVolumeImpl v = volumes.getNextVolume(storageType, b.getNumBytes());
  // create a temporary file to hold block in the designated volume
  File f = v.createTmpFile(b.getBlockPoolId(), b.getLocalBlock());
  ReplicaInPipeline newReplicaInfo = new ReplicaInPipeline(b.getBlockId(),
      b.getGenerationStamp(), v, f.getParentFile(), 0);
  volumeMap.add(b.getBlockPoolId(), newReplicaInfo);
  return newReplicaInfo;
}
/**
 * Returns a clone of a replica stored in data-node memory.
 * Should be primarily used for testing.
 *
 * @param bpid block pool id
 * @param blockId block id
 * @return a clone of the in-memory replica, or null if none exists
 */
ReplicaInfo fetchReplicaInfo(String bpid, long blockId) {
  ReplicaInfo r = volumeMap.get(bpid, blockId);
  if (r == null) {
    return null;
  }
  switch (r.getState()) {
  case FINALIZED:
    return new FinalizedReplica((FinalizedReplica) r);
  case RBW:
    return new ReplicaBeingWritten((ReplicaBeingWritten) r);
  case RWR:
    return new ReplicaWaitingToBeRecovered((ReplicaWaitingToBeRecovered) r);
  case RUR:
    return new ReplicaUnderRecovery((ReplicaUnderRecovery) r);
  case TEMPORARY:
    return new ReplicaInPipeline((ReplicaInPipeline) r);
  }
  return null;
}
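A minimal sketch of how a test in the same package might exercise fetchReplicaInfo; the dataset field, bpid, blockId, and the JUnit assertions are hypothetical test fixtures, not part of the method above.

// Hypothetical JUnit-style usage, assuming access to an FsDatasetImpl
// instance named "dataset" and a block already tracked in its volume map.
ReplicaInfo clone = dataset.fetchReplicaInfo(bpid, blockId);
assertNotNull("replica should be present in the volume map", clone);
// The returned object is a copy, so asserting on (or mutating) it does not
// touch the replica the datanode actually tracks.
assertEquals(ReplicaState.FINALIZED, clone.getState());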
@Override // FsDatasetSpi
public synchronized ReplicaInPipeline recoverAppend(ExtendedBlock b, long newGS,
    long expectedBlockLen) throws IOException {
  LOG.info("Recover failed append to " + b);

  ReplicaInfo replicaInfo = recoverCheck(b, newGS, expectedBlockLen);

  // change the replica's state/gs etc.
  if (replicaInfo.getState() == ReplicaState.FINALIZED) {
    return append(b.getBlockPoolId(), (FinalizedReplica) replicaInfo,
        newGS, b.getNumBytes());
  } else { //RBW
    bumpReplicaGS(replicaInfo, newGS);
    return (ReplicaBeingWritten) replicaInfo;
  }
}
@Override
public Replica createReplicaInPipeline(
    FsVolumeSpi volume, ExtendedBlock block) throws IOException {
  FsVolumeImpl vol = (FsVolumeImpl) volume;
  ReplicaInPipeline rip = new ReplicaInPipeline(
      block.getBlockId(), block.getGenerationStamp(), volume,
      vol.createTmpFile(
          block.getBlockPoolId(), block.getLocalBlock()).getParentFile(),
      0);
  dataset.volumeMap.add(block.getBlockPoolId(), rip);
  return rip;
}
@Override // FsDatasetSpi
public synchronized ReplicaHandler createTemporary(
    StorageType storageType, ExtendedBlock b) throws IOException {
  ReplicaInfo replicaInfo = volumeMap.get(b.getBlockPoolId(), b.getBlockId());
  if (replicaInfo != null) {
    if (replicaInfo.getGenerationStamp() < b.getGenerationStamp()
        && replicaInfo instanceof ReplicaInPipeline) {
      // Stop the previous writer
      ((ReplicaInPipeline)replicaInfo)
          .stopWriter(datanode.getDnConf().getXceiverStopTimeout());
      invalidate(b.getBlockPoolId(), new Block[]{replicaInfo});
    } else {
      throw new ReplicaAlreadyExistsException("Block " + b +
          " already exists in state " + replicaInfo.getState() +
          " and thus cannot be created.");
    }
  }

  FsVolumeReference ref = volumes.getNextVolume(storageType, b.getNumBytes());
  FsVolumeImpl v = (FsVolumeImpl) ref.getVolume();
  // create a temporary file to hold block in the designated volume
  File f;
  try {
    f = v.createTmpFile(b.getBlockPoolId(), b.getLocalBlock());
  } catch (IOException e) {
    IOUtils.cleanup(null, ref);
    throw e;
  }
  ReplicaInPipeline newReplicaInfo = new ReplicaInPipeline(b.getBlockId(),
      b.getGenerationStamp(), v, f.getParentFile(), 0);
  volumeMap.add(b.getBlockPoolId(), newReplicaInfo);
  return new ReplicaHandler(newReplicaInfo, ref);
}
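The try/catch around createTmpFile shows a general acquire-then-release-on-failure pattern: the volume reference taken by getNextVolume must be released if any later step throws, otherwise it leaks. A self-contained sketch of that pattern with plain java.io types; ResourceRef, allocate, and riskyStep are illustrative names, not HDFS APIs.

import java.io.Closeable;
import java.io.IOException;

final class AcquireThenCleanup {
  static final class ResourceRef implements Closeable {
    @Override public void close() { /* release the underlying resource */ }
  }

  // Acquire a reference, run a step that may fail, and close the reference
  // before rethrowing so nothing is leaked on the failure path.
  static ResourceRef allocate() throws IOException {
    ResourceRef ref = new ResourceRef();
    try {
      riskyStep();        // analogous to v.createTmpFile(...)
    } catch (IOException e) {
      ref.close();        // analogous to IOUtils.cleanup(null, ref)
      throw e;
    }
    return ref;
  }

  static void riskyStep() throws IOException { /* may throw */ }
}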
@Override // FsDatasetSpi
public synchronized ReplicaInPipeline append(ExtendedBlock b,
    long newGS, long expectedBlockLen) throws IOException {
  // If the block was successfully finalized because all packets were
  // successfully processed at the Datanode but the ack for some of the
  // packets was not received by the client, the client re-opens the
  // connection and retries sending those packets.
  // The other reason is that an "append" is occurring to this block.

  // check the validity of the parameter
  if (newGS < b.getGenerationStamp()) {
    throw new IOException("The new generation stamp " + newGS +
        " should be greater than the replica " + b + "'s generation stamp");
  }
  ReplicaInfo replicaInfo = getReplicaInfo(b);
  LOG.info("Appending to " + replicaInfo);
  if (replicaInfo.getState() != ReplicaState.FINALIZED) {
    throw new ReplicaNotFoundException(
        ReplicaNotFoundException.UNFINALIZED_REPLICA + b);
  }
  if (replicaInfo.getNumBytes() != expectedBlockLen) {
    throw new IOException("Corrupted replica " + replicaInfo +
        " with a length of " + replicaInfo.getNumBytes() +
        " expected length is " + expectedBlockLen);
  }

  return append(b.getBlockPoolId(), (FinalizedReplica)replicaInfo, newGS,
      b.getNumBytes());
}
@Override // FsDatasetSpi
public synchronized ReplicaInPipeline createRbw(StorageType storageType,
    ExtendedBlock b, boolean allowLazyPersist) throws IOException {
  ReplicaInfo replicaInfo = volumeMap.get(b.getBlockPoolId(),
      b.getBlockId());
  if (replicaInfo != null) {
    throw new ReplicaAlreadyExistsException("Block " + b +
        " already exists in state " + replicaInfo.getState() +
        " and thus cannot be created.");
  }
  // create a new block
  FsVolumeImpl v;
  while (true) {
    try {
      if (allowLazyPersist) {
        // First try to place the block on a transient volume.
        v = volumes.getNextTransientVolume(b.getNumBytes());
        datanode.getMetrics().incrRamDiskBlocksWrite();
      } else {
        v = volumes.getNextVolume(storageType, b.getNumBytes());
      }
    } catch (DiskOutOfSpaceException de) {
      if (allowLazyPersist) {
        datanode.getMetrics().incrRamDiskBlocksWriteFallback();
        allowLazyPersist = false;
        continue;
      }
      throw de;
    }
    break;
  }
  // create an rbw file to hold block in the designated volume
  File f = v.createRbwFile(b.getBlockPoolId(), b.getLocalBlock());
  ReplicaBeingWritten newReplicaInfo = new ReplicaBeingWritten(b.getBlockId(),
      b.getGenerationStamp(), v, f.getParentFile(), b.getNumBytes());
  volumeMap.add(b.getBlockPoolId(), newReplicaInfo);
  return newReplicaInfo;
}
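The while(true) loop above is a one-shot fallback: try a transient (RAM_DISK) volume first and, on DiskOutOfSpaceException, retry exactly once against regular storage. A simplified, self-contained sketch of the same control flow; chooseTransient, chooseRegular, and NoSpaceException are placeholders, not HDFS APIs.

final class PlacementFallback {
  static final class NoSpaceException extends RuntimeException { }

  // Tries the transient tier first when allowed; falls back to regular
  // storage exactly once if the transient tier has no room.
  static String place(boolean allowLazyPersist) {
    while (true) {
      try {
        return allowLazyPersist ? chooseTransient() : chooseRegular();
      } catch (NoSpaceException e) {
        if (allowLazyPersist) {
          allowLazyPersist = false;   // fall back and retry once
          continue;
        }
        throw e;                      // regular tier is also full
      }
    }
  }

  static String chooseTransient() { throw new NoSpaceException(); }
  static String chooseRegular() { return "/data/disk0"; }
}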
@Override // FsDatasetSpi
public synchronized ReplicaInPipeline append(ExtendedBlock b,
    long newGS, long expectedBlockLen) throws IOException {
  // If the block was successfully finalized because all packets were
  // successfully processed at the Datanode but the ack for some of the
  // packets was not received by the client, the client re-opens the
  // connection and retries sending those packets.
  // The other reason is that an "append" is occurring to this block.

  // check the validity of the parameter
  if (newGS < b.getGenerationStamp()) {
    throw new IOException("The new generation stamp " + newGS +
        " should be greater than the replica " + b + "'s generation stamp");
  }
  ReplicaInfo replicaInfo = getReplicaInfo(b);
  LOG.info("Appending to " + replicaInfo);
  if (replicaInfo.getState() != ReplicaState.FINALIZED) {
    throw new ReplicaNotFoundException(
        ReplicaNotFoundException.UNFINALIZED_REPLICA + b);
  }
  if (replicaInfo.getNumBytes() != expectedBlockLen) {
    throw new IOException("Corrupted replica " + replicaInfo +
        " with a length of " + replicaInfo.getNumBytes() +
        " expected length is " + expectedBlockLen);
  }

  return append(b.getBlockPoolId(), (FinalizedReplica) replicaInfo, newGS,
      b.getNumBytes());
}
/**
 * Move block files from one storage to another storage.
 * @return the old replicaInfo
 * @throws IOException
 */
@Override
public ReplicaInfo moveBlockAcrossStorage(ExtendedBlock block,
    StorageType targetStorageType) throws IOException {
  ReplicaInfo replicaInfo = getReplicaInfo(block);
  if (replicaInfo.getState() != ReplicaState.FINALIZED) {
    throw new ReplicaNotFoundException(
        ReplicaNotFoundException.UNFINALIZED_REPLICA + block);
  }
  if (replicaInfo.getNumBytes() != block.getNumBytes()) {
    throw new IOException("Corrupted replica " + replicaInfo
        + " with a length of " + replicaInfo.getNumBytes()
        + " expected length is " + block.getNumBytes());
  }
  if (replicaInfo.getVolume().getStorageType() == targetStorageType) {
    throw new ReplicaAlreadyExistsException("Replica " + replicaInfo
        + " already exists on storage " + targetStorageType);
  }

  if (replicaInfo.isOnTransientStorage()) {
    // Block movement from RAM_DISK will be done by LazyPersist mechanism
    throw new IOException("Replica " + replicaInfo
        + " cannot be moved from storageType : "
        + replicaInfo.getVolume().getStorageType());
  }

  try (FsVolumeReference volumeRef = volumes.getNextVolume(
      targetStorageType, block.getNumBytes())) {
    File oldBlockFile = replicaInfo.getBlockFile();
    File oldMetaFile = replicaInfo.getMetaFile();
    FsVolumeImpl targetVolume = (FsVolumeImpl) volumeRef.getVolume();
    // Copy files to temp dir first
    File[] blockFiles = copyBlockFiles(block.getBlockId(),
        block.getGenerationStamp(), oldMetaFile, oldBlockFile,
        targetVolume.getTmpDir(block.getBlockPoolId()),
        replicaInfo.isOnTransientStorage());

    ReplicaInfo newReplicaInfo = new ReplicaInPipeline(
        replicaInfo.getBlockId(), replicaInfo.getGenerationStamp(),
        targetVolume, blockFiles[0].getParentFile(), 0);
    newReplicaInfo.setNumBytes(blockFiles[1].length());
    // Finalize the copied files
    newReplicaInfo = finalizeReplica(block.getBlockPoolId(), newReplicaInfo);

    removeOldReplica(replicaInfo, newReplicaInfo, oldBlockFile, oldMetaFile,
        oldBlockFile.length(), oldMetaFile.length(), block.getBlockPoolId());
  }

  // Replace the old block if any to reschedule the scanning.
  return replicaInfo;
}
@Override // FsDatasetSpi
public ReplicaHandler createTemporary(
    StorageType storageType, ExtendedBlock b) throws IOException {
  long startTimeMs = Time.monotonicNow();
  long writerStopTimeoutMs = datanode.getDnConf().getXceiverStopTimeout();
  ReplicaInfo lastFoundReplicaInfo = null;
  do {
    synchronized (this) {
      ReplicaInfo currentReplicaInfo =
          volumeMap.get(b.getBlockPoolId(), b.getBlockId());
      if (currentReplicaInfo == lastFoundReplicaInfo) {
        if (lastFoundReplicaInfo != null) {
          invalidate(b.getBlockPoolId(), new Block[] { lastFoundReplicaInfo });
        }
        FsVolumeReference ref =
            volumes.getNextVolume(storageType, b.getNumBytes());
        FsVolumeImpl v = (FsVolumeImpl) ref.getVolume();
        // create a temporary file to hold block in the designated volume
        File f;
        try {
          f = v.createTmpFile(b.getBlockPoolId(), b.getLocalBlock());
        } catch (IOException e) {
          IOUtils.cleanup(null, ref);
          throw e;
        }
        ReplicaInPipeline newReplicaInfo =
            new ReplicaInPipeline(b.getBlockId(), b.getGenerationStamp(), v,
                f.getParentFile(), 0);
        volumeMap.add(b.getBlockPoolId(), newReplicaInfo);
        return new ReplicaHandler(newReplicaInfo, ref);
      } else {
        if (!(currentReplicaInfo.getGenerationStamp() < b.getGenerationStamp()
            && currentReplicaInfo instanceof ReplicaInPipeline)) {
          throw new ReplicaAlreadyExistsException("Block " + b
              + " already exists in state " + currentReplicaInfo.getState()
              + " and thus cannot be created.");
        }
        lastFoundReplicaInfo = currentReplicaInfo;
      }
    }

    // Hang too long, just bail out. This is not supposed to happen.
    long writerStopMs = Time.monotonicNow() - startTimeMs;
    if (writerStopMs > writerStopTimeoutMs) {
      LOG.warn("Unable to stop existing writer for block " + b + " after "
          + writerStopMs + " milliseconds.");
      throw new IOException("Unable to stop existing writer for block " + b
          + " after " + writerStopMs + " milliseconds.");
    }

    // Stop the previous writer
    ((ReplicaInPipeline) lastFoundReplicaInfo).stopWriter(writerStopTimeoutMs);
  } while (true);
}
/**
 * Move block files from one storage to another storage.
 * @return the old replicaInfo
 * @throws IOException
 */
@Override
public ReplicaInfo moveBlockAcrossStorage(ExtendedBlock block,
    StorageType targetStorageType) throws IOException {
  ReplicaInfo replicaInfo = getReplicaInfo(block);
  if (replicaInfo.getState() != ReplicaState.FINALIZED) {
    throw new ReplicaNotFoundException(
        ReplicaNotFoundException.UNFINALIZED_REPLICA + block);
  }
  if (replicaInfo.getNumBytes() != block.getNumBytes()) {
    throw new IOException("Corrupted replica " + replicaInfo
        + " with a length of " + replicaInfo.getNumBytes()
        + " expected length is " + block.getNumBytes());
  }
  if (replicaInfo.getVolume().getStorageType() == targetStorageType) {
    throw new ReplicaAlreadyExistsException("Replica " + replicaInfo
        + " already exists on storage " + targetStorageType);
  }

  if (replicaInfo.isOnTransientStorage()) {
    // Block movement from RAM_DISK will be done by LazyPersist mechanism
    throw new IOException("Replica " + replicaInfo
        + " cannot be moved from storageType : "
        + replicaInfo.getVolume().getStorageType());
  }

  try (FsVolumeReference volumeRef = volumes.getNextVolume(
      targetStorageType, block.getNumBytes())) {
    File oldBlockFile = replicaInfo.getBlockFile();
    File oldMetaFile = replicaInfo.getMetaFile();
    FsVolumeImpl targetVolume = (FsVolumeImpl) volumeRef.getVolume();
    // Copy files to temp dir first
    File[] blockFiles = copyBlockFiles(block.getBlockId(),
        block.getGenerationStamp(), oldMetaFile, oldBlockFile,
        targetVolume.getTmpDir(block.getBlockPoolId()),
        replicaInfo.isOnTransientStorage(), smallBufferSize, conf);

    ReplicaInfo newReplicaInfo = new ReplicaInPipeline(
        replicaInfo.getBlockId(), replicaInfo.getGenerationStamp(),
        targetVolume, blockFiles[0].getParentFile(), 0);
    newReplicaInfo.setNumBytes(blockFiles[1].length());
    // Finalize the copied files
    newReplicaInfo = finalizeReplica(block.getBlockPoolId(), newReplicaInfo);

    removeOldReplica(replicaInfo, newReplicaInfo, oldBlockFile, oldMetaFile,
        oldBlockFile.length(), oldMetaFile.length(), block.getBlockPoolId());
  }

  // Replace the old block if any to reschedule the scanning.
  return replicaInfo;
}
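moveBlockAcrossStorage copies to a temp directory on the target volume, finalizes the copy, and only then removes the old files, so a crash mid-move never leaves the block without a valid replica. A stripped-down sketch of that ordering using java.nio.file; the paths and directory layout are illustrative only.

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;

final class CopyThenPromote {
  // Copy into a temp dir on the target storage, promote the copy into its
  // final location, and only then delete the original file.
  static void move(Path src, Path tmpDir, Path finalDir) throws IOException {
    Path tmp = tmpDir.resolve(src.getFileName());
    Files.copy(src, tmp, StandardCopyOption.REPLACE_EXISTING); // 1: copy to temp
    Path dst = finalDir.resolve(src.getFileName());
    Files.move(tmp, dst, StandardCopyOption.ATOMIC_MOVE);      // 2: finalize
    Files.delete(src);                                         // 3: remove old
  }
}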
@Override // FsDatasetSpi
public ReplicaHandler createTemporary(
    StorageType storageType, ExtendedBlock b) throws IOException {
  long startTimeMs = Time.monotonicNow();
  long writerStopTimeoutMs = datanode.getDnConf().getXceiverStopTimeout();
  ReplicaInfo lastFoundReplicaInfo = null;
  do {
    synchronized (this) {
      ReplicaInfo currentReplicaInfo =
          volumeMap.get(b.getBlockPoolId(), b.getBlockId());
      if (currentReplicaInfo == lastFoundReplicaInfo) {
        if (lastFoundReplicaInfo != null) {
          invalidate(b.getBlockPoolId(), new Block[] { lastFoundReplicaInfo });
        }
        FsVolumeReference ref =
            volumes.getNextVolume(storageType, b.getNumBytes());
        FsVolumeImpl v = (FsVolumeImpl) ref.getVolume();
        // create a temporary file to hold block in the designated volume
        File f;
        try {
          f = v.createTmpFile(b.getBlockPoolId(), b.getLocalBlock());
        } catch (IOException e) {
          IOUtils.cleanup(null, ref);
          throw e;
        }
        ReplicaInPipeline newReplicaInfo =
            new ReplicaInPipeline(b.getBlockId(), b.getGenerationStamp(), v,
                f.getParentFile(), b.getLocalBlock().getNumBytes());
        volumeMap.add(b.getBlockPoolId(), newReplicaInfo);
        return new ReplicaHandler(newReplicaInfo, ref);
      } else {
        if (!(currentReplicaInfo.getGenerationStamp() < b.getGenerationStamp()
            && currentReplicaInfo instanceof ReplicaInPipeline)) {
          throw new ReplicaAlreadyExistsException("Block " + b
              + " already exists in state " + currentReplicaInfo.getState()
              + " and thus cannot be created.");
        }
        lastFoundReplicaInfo = currentReplicaInfo;
      }
    }

    // Hang too long, just bail out. This is not supposed to happen.
    long writerStopMs = Time.monotonicNow() - startTimeMs;
    if (writerStopMs > writerStopTimeoutMs) {
      LOG.warn("Unable to stop existing writer for block " + b + " after "
          + writerStopMs + " milliseconds.");
      throw new IOException("Unable to stop existing writer for block " + b
          + " after " + writerStopMs + " milliseconds.");
    }

    // Stop the previous writer
    ((ReplicaInPipeline) lastFoundReplicaInfo).stopWriter(writerStopTimeoutMs);
  } while (true);
}
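The do/while loop above bounds how long createTemporary keeps stopping stale writers before giving up. A self-contained sketch of that bounded-retry shape; tryOnce and stopStaleWriter stand in for the work done under the dataset lock and are not real HDFS calls.

import java.io.IOException;

final class BoundedRetry {
  // Retry until an attempt succeeds or the time budget is exhausted.
  static String retry(long timeoutMs) throws IOException {
    long start = System.nanoTime();
    while (true) {
      String result = tryOnce();
      if (result != null) {
        return result;
      }
      long elapsedMs = (System.nanoTime() - start) / 1_000_000;
      if (elapsedMs > timeoutMs) {
        throw new IOException("gave up after " + elapsedMs + " milliseconds");
      }
      stopStaleWriter();   // analogous to ReplicaInPipeline.stopWriter(...)
    }
  }

  static String tryOnce() { return null; }
  static void stopStaleWriter() { }
}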
@Override // FsDatasetSpi
public synchronized ReplicaInPipeline recoverRbw(ExtendedBlock b,
    long newGS, long minBytesRcvd, long maxBytesRcvd)
    throws IOException {
  LOG.info("Recover RBW replica " + b);

  ReplicaInfo replicaInfo = getReplicaInfo(b.getBlockPoolId(), b.getBlockId());

  // check the replica's state
  if (replicaInfo.getState() != ReplicaState.RBW) {
    throw new ReplicaNotFoundException(
        ReplicaNotFoundException.NON_RBW_REPLICA + replicaInfo);
  }
  ReplicaBeingWritten rbw = (ReplicaBeingWritten)replicaInfo;

  LOG.info("Recovering " + rbw);

  // Stop the previous writer
  rbw.stopWriter(datanode.getDnConf().getXceiverStopTimeout());
  rbw.setWriter(Thread.currentThread());
  // check generation stamp
  long replicaGenerationStamp = rbw.getGenerationStamp();
  if (replicaGenerationStamp < b.getGenerationStamp() ||
      replicaGenerationStamp > newGS) {
    throw new ReplicaNotFoundException(
        ReplicaNotFoundException.UNEXPECTED_GS_REPLICA + b +
        ". Expected GS range is [" + b.getGenerationStamp() + ", " +
        newGS + "].");
  }

  // check replica length
  long bytesAcked = rbw.getBytesAcked();
  long numBytes = rbw.getNumBytes();
  if (bytesAcked < minBytesRcvd || numBytes > maxBytesRcvd) {
    throw new ReplicaNotFoundException("Unmatched length replica " +
        replicaInfo + ": BytesAcked = " + bytesAcked +
        " BytesRcvd = " + numBytes + " are not in the range of [" +
        minBytesRcvd + ", " + maxBytesRcvd + "].");
  }

  // Truncate the potentially corrupt portion.
  // If the source was a client and the last node in the pipeline was lost,
  // any corrupt data written after the acked length can go unnoticed.
  if (numBytes > bytesAcked) {
    final File replicafile = rbw.getBlockFile();
    truncateBlock(replicafile, rbw.getMetaFile(), numBytes, bytesAcked);
    rbw.setNumBytes(bytesAcked);
    rbw.setLastChecksumAndDataLen(bytesAcked, null);
  }

  // bump the replica's generation stamp to newGS
  bumpReplicaGS(rbw, newGS);
  return rbw;
}
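In the truncation step above, anything written past the acked length is treated as potentially corrupt and discarded. A minimal sketch of truncating a data file to a known-good length; the real truncateBlock also rewrites the trailing checksum in the companion meta file, which this sketch omits.

import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;

final class TruncateToAcked {
  // Shrink the block file to bytesAcked; bytes after that offset are dropped.
  static void truncate(File blockFile, long bytesAcked) throws IOException {
    try (RandomAccessFile raf = new RandomAccessFile(blockFile, "rw")) {
      if (raf.length() > bytesAcked) {
        raf.setLength(bytesAcked);
      }
    }
  }
}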