/** * Requests that the NameNode download an image from this node. Allows for * optional external cancelation. * * @param fsName the http address for the remote NN * @param conf Configuration * @param storage the storage directory to transfer the image from * @param nnf the NameNodeFile type of the image * @param txid the transaction ID of the image to be uploaded * @param canceler optional canceler to check for abort of upload * @throws IOException if there is an I/O error or cancellation */ public static void uploadImageFromStorage(URL fsName, Configuration conf, NNStorage storage, NameNodeFile nnf, long txid, Canceler canceler) throws IOException { URL url = new URL(fsName, ImageServlet.PATH_SPEC); long startTime = Time.monotonicNow(); try { uploadImage(url, conf, storage, nnf, txid, canceler); } catch (HttpPutFailedException e) { if (e.getResponseCode() == HttpServletResponse.SC_CONFLICT) { // this is OK - this means that a previous attempt to upload // this checkpoint succeeded even though we thought it failed. LOG.info("Image upload with txid " + txid + " conflicted with a previous image upload to the " + "same NameNode. Continuing...", e); return; } else { throw e; } } double xferSec = Math.max( ((float) (Time.monotonicNow() - startTime)) / 1000.0, 0.001); LOG.info("Uploaded image with txid " + txid + " to namenode at " + fsName + " in " + xferSec + " seconds"); }
/** * Test for the case when the SBN is configured to checkpoint based * on a time period, but no transactions are happening on the * active. Thus, it would want to save a second checkpoint at the * same txid, which is a no-op. This test makes sure this doesn't * cause any problem. */ @Test(timeout = 300000) public void testCheckpointWhenNoNewTransactionsHappened() throws Exception { // Checkpoint as fast as we can, in a tight loop. cluster.getConfiguration(1).setInt( DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_PERIOD_KEY, 0); cluster.restartNameNode(1); nn1 = cluster.getNameNode(1); FSImage spyImage1 = NameNodeAdapter.spyOnFsImage(nn1); // We shouldn't save any checkpoints at txid=0 Thread.sleep(1000); Mockito.verify(spyImage1, Mockito.never()) .saveNamespace((FSNamesystem) Mockito.anyObject()); // Roll the primary and wait for the standby to catch up HATestUtil.waitForStandbyToCatchUp(nn0, nn1); Thread.sleep(2000); // We should make exactly one checkpoint at this new txid. Mockito.verify(spyImage1, Mockito.times(1)).saveNamespace( (FSNamesystem) Mockito.anyObject(), Mockito.eq(NameNodeFile.IMAGE), (Canceler) Mockito.anyObject()); }
/** * Requests that the NameNode download an image from this node. Allows for * optional external cancelation. * * @param fsName the http address for the remote NN * @param conf Configuration * @param storage the storage directory to transfer the image from * @param nnf the NameNodeFile type of the image * @param txid the transaction ID of the image to be uploaded * @param canceler optional canceler to check for abort of upload * @throws IOException if there is an I/O error or cancellation */ public static TransferResult uploadImageFromStorage(URL fsName, Configuration conf, NNStorage storage, NameNodeFile nnf, long txid, Canceler canceler) throws IOException { URL url = new URL(fsName, ImageServlet.PATH_SPEC); long startTime = Time.monotonicNow(); try { uploadImage(url, conf, storage, nnf, txid, canceler); } catch (HttpPutFailedException e) { // translate the error code to a result, which is a bit more obvious in usage TransferResult result = TransferResult.getResultForCode(e.getResponseCode()); if (result.shouldReThrowException) { throw e; } return result; } double xferSec = Math.max( ((float) (Time.monotonicNow() - startTime)) / 1000.0, 0.001); LOG.info("Uploaded image with txid " + txid + " to namenode at " + fsName + " in " + xferSec + " seconds"); return TransferResult.SUCCESS; }
/** * Test for the case when the SBN is configured to checkpoint based * on a time period, but no transactions are happening on the * active. Thus, it would want to save a second checkpoint at the * same txid, which is a no-op. This test makes sure this doesn't * cause any problem. */ @Test(timeout = 300000) public void testCheckpointWhenNoNewTransactionsHappened() throws Exception { // Checkpoint as fast as we can, in a tight loop. cluster.getConfiguration(1).setInt( DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_PERIOD_KEY, 0); cluster.restartNameNode(1); nns[1] = cluster.getNameNode(1); FSImage spyImage1 = NameNodeAdapter.spyOnFsImage(nns[1]); // We shouldn't save any checkpoints at txid=0 Thread.sleep(1000); Mockito.verify(spyImage1, Mockito.never()) .saveNamespace((FSNamesystem) Mockito.anyObject()); // Roll the primary and wait for the standby to catch up HATestUtil.waitForStandbyToCatchUp(nns[0], nns[1]); Thread.sleep(2000); // We should make exactly one checkpoint at this new txid. Mockito.verify(spyImage1, Mockito.times(1)).saveNamespace( (FSNamesystem) Mockito.anyObject(), Mockito.eq(NameNodeFile.IMAGE), (Canceler) Mockito.anyObject()); }
/** * Save the contents of the FS image to a new image file in each of the * current storage directories. * @param canceler */ public synchronized void saveNamespace(FSNamesystem source, Canceler canceler) throws IOException { assert editLog != null : "editLog must be initialized"; storage.attemptRestoreRemovedStorage(); boolean editLogWasOpen = editLog.isSegmentOpen(); if (editLogWasOpen) { editLog.endCurrentLogSegment(true); } long imageTxId = getLastAppliedOrWrittenTxId(); try { saveFSImageInAllDirs(source, imageTxId, canceler); storage.writeAll(); } finally { if (editLogWasOpen) { editLog.startLogSegment(imageTxId + 1, true); // Take this opportunity to note the current transaction. // Even if the namespace save was cancelled, this marker // is only used to determine what transaction ID is required // for startup. So, it doesn't hurt to update it unnecessarily. storage.writeTransactionIdFileToStorage(imageTxId + 1); } } }
/** * Test for the case when the SBN is configured to checkpoint based * on a time period, but no transactions are happening on the * active. Thus, it would want to save a second checkpoint at the * same txid, which is a no-op. This test makes sure this doesn't * cause any problem. */ @Test public void testCheckpointWhenNoNewTransactionsHappened() throws Exception { // Checkpoint as fast as we can, in a tight loop. cluster.getConfiguration(1).setInt( DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_PERIOD_KEY, 0); cluster.restartNameNode(1); nn1 = cluster.getNameNode(1); FSImage spyImage1 = NameNodeAdapter.spyOnFsImage(nn1); // We shouldn't save any checkpoints at txid=0 Thread.sleep(1000); Mockito.verify(spyImage1, Mockito.never()) .saveNamespace((FSNamesystem) Mockito.anyObject()); // Roll the primary and wait for the standby to catch up HATestUtil.waitForStandbyToCatchUp(nn0, nn1); Thread.sleep(2000); // We should make exactly one checkpoint at this new txid. Mockito.verify(spyImage1, Mockito.times(1)) .saveNamespace((FSNamesystem) Mockito.anyObject(), (Canceler)Mockito.anyObject()); }
/** * Test for the case when the SBN is configured to checkpoint based * on a time period, but no transactions are happening on the * active. Thus, it would want to save a second checkpoint at the * same txid, which is a no-op. This test makes sure this doesn't * cause any problem. */ @Test public void testCheckpointWhenNoNewTransactionsHappened() throws Exception { // Checkpoint as fast as we can, in a tight loop. cluster.getConfiguration(1).setInt( DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_PERIOD_KEY, 0); cluster.restartNameNode(1); nn1 = cluster.getNameNode(1); FSImage spyImage1 = NameNodeAdapter.spyOnFsImage(nn1); // We shouldn't save any checkpoints at txid=0 Thread.sleep(1000); Mockito.verify(spyImage1, Mockito.never()) .saveNamespace((FSNamesystem) Mockito.anyObject()); // Roll the primary and wait for the standby to catch up HATestUtil.waitForStandbyToCatchUp(nn0, nn1); Thread.sleep(2000); // We should make exactly one checkpoint at this new txid. Mockito.verify(spyImage1, Mockito.times(1)).saveNamespace( (FSNamesystem) Mockito.anyObject(), Mockito.eq(NameNodeFile.IMAGE), (Canceler) Mockito.anyObject()); }
SaveNamespaceContext( FSNamesystem sourceNamesystem, long txid, Canceler canceller) { this.sourceNamesystem = sourceNamesystem; this.txid = txid; this.canceller = canceller; }
private static void writeFileToPutRequest(Configuration conf, HttpURLConnection connection, File imageFile, Canceler canceler) throws FileNotFoundException, IOException { connection.setRequestProperty(CONTENT_TYPE, "application/octet-stream"); connection.setRequestProperty(CONTENT_TRANSFER_ENCODING, "binary"); OutputStream output = connection.getOutputStream(); FileInputStream input = new FileInputStream(imageFile); try { copyFileToStream(output, imageFile, input, ImageServlet.getThrottler(conf), canceler); } finally { IOUtils.closeStream(input); IOUtils.closeStream(output); } }
/** * Save FSimage in the legacy format. This is not for NN consumption, * but for tools like OIV. */ public void saveLegacyOIVImage(FSNamesystem source, String targetDir, Canceler canceler) throws IOException { FSImageCompression compression = FSImageCompression.createCompression(conf); long txid = getLastAppliedOrWrittenTxId(); SaveNamespaceContext ctx = new SaveNamespaceContext(source, txid, canceler); FSImageFormat.Saver saver = new FSImageFormat.Saver(ctx); String imageFileName = NNStorage.getLegacyOIVImageFileName(txid); File imageFile = new File(targetDir, imageFileName); saver.save(imageFile, compression); archivalManager.purgeOldLegacyOIVImages(targetDir, txid); }
/** * Save the contents of the FS image to a new image file in each of the * current storage directories. */ public synchronized void saveNamespace(FSNamesystem source, NameNodeFile nnf, Canceler canceler) throws IOException { assert editLog != null : "editLog must be initialized"; LOG.info("Save namespace ..."); storage.attemptRestoreRemovedStorage(); boolean editLogWasOpen = editLog.isSegmentOpen(); if (editLogWasOpen) { editLog.endCurrentLogSegment(true); } long imageTxId = getLastAppliedOrWrittenTxId(); if (!addToCheckpointing(imageTxId)) { throw new IOException( "FS image is being downloaded from another NN at txid " + imageTxId); } try { try { saveFSImageInAllDirs(source, nnf, imageTxId, canceler); storage.writeAll(); } finally { if (editLogWasOpen) { editLog.startLogSegment(imageTxId + 1, true); // Take this opportunity to note the current transaction. // Even if the namespace save was cancelled, this marker // is only used to determine what transaction ID is required // for startup. So, it doesn't hurt to update it unnecessarily. storage.writeTransactionIdFileToStorage(imageTxId + 1); } } } finally { removeFromCheckpointing(imageTxId); } }
/** Save the fsimage to a temp file */ private File saveFSImageToTempFile() throws IOException { SaveNamespaceContext context = new SaveNamespaceContext(fsn, txid, new Canceler()); FSImageFormatProtobuf.Saver saver = new FSImageFormatProtobuf.Saver(context); FSImageCompression compression = FSImageCompression.createCompression(conf); File imageFile = getImageFile(testDir, txid); fsn.readLock(); try { saver.save(imageFile, compression); } finally { fsn.readUnlock(); } return imageFile; }
/** * Save the contents of the FS image to a new image file in each of the * current storage directories. */ public synchronized void saveNamespace(FSNamesystem source, NameNodeFile nnf, Canceler canceler) throws IOException { assert editLog != null : "editLog must be initialized"; LOG.info("Save namespace ..."); storage.attemptRestoreRemovedStorage(); boolean editLogWasOpen = editLog.isSegmentOpen(); if (editLogWasOpen) { editLog.endCurrentLogSegment(true); } long imageTxId = getLastAppliedOrWrittenTxId(); if (!addToCheckpointing(imageTxId)) { throw new IOException( "FS image is being downloaded from another NN at txid " + imageTxId); } try { try { saveFSImageInAllDirs(source, nnf, imageTxId, canceler); if (!source.isRollingUpgrade()) { storage.writeAll(); } } finally { if (editLogWasOpen) { editLog.startLogSegmentAndWriteHeaderTxn(imageTxId + 1, source.getEffectiveLayoutVersion()); // Take this opportunity to note the current transaction. // Even if the namespace save was cancelled, this marker // is only used to determine what transaction ID is required // for startup. So, it doesn't hurt to update it unnecessarily. storage.writeTransactionIdFileToStorage(imageTxId + 1); } } } finally { removeFromCheckpointing(imageTxId); } //Update NameDirSize Metric getStorage().updateNameDirSize(); }
/** * Save the contents of the FS image to a new image file in each of the * current storage directories. */ public synchronized void saveNamespace(FSNamesystem source, NameNodeFile nnf, Canceler canceler) throws IOException { assert editLog != null : "editLog must be initialized"; LOG.info("Save namespace ..."); storage.attemptRestoreRemovedStorage(); boolean editLogWasOpen = editLog.isSegmentOpen(); if (editLogWasOpen) { editLog.endCurrentLogSegment(true); } long imageTxId = getLastAppliedOrWrittenTxId(); try { saveFSImageInAllDirs(source, nnf, imageTxId, canceler); storage.writeAll(); } finally { if (editLogWasOpen) { editLog.startLogSegment(imageTxId + 1, true); // Take this opportunity to note the current transaction. // Even if the namespace save was cancelled, this marker // is only used to determine what transaction ID is required // for startup. So, it doesn't hurt to update it unnecessarily. storage.writeTransactionIdFileToStorage(imageTxId + 1); } } }
/** Save the fsimage to a temp file */ private File saveFSImageToTempFile() throws IOException { SaveNamespaceContext context = new SaveNamespaceContext(fsn, txid, new Canceler()); FSImageFormat.Saver saver = new FSImageFormat.Saver(context); FSImageCompression compression = FSImageCompression.createCompression(conf); File imageFile = getImageFile(testDir, txid); fsn.readLock(); try { saver.save(imageFile, compression); } finally { fsn.readUnlock(); } return imageFile; }
/** * Make sure that clients will receive StandbyExceptions even when a * checkpoint is in progress on the SBN, and therefore the StandbyCheckpointer * thread will have FSNS lock. Regression test for HDFS-4591. */ @Test(timeout=300000) public void testStandbyExceptionThrownDuringCheckpoint() throws Exception { // Set it up so that we know when the SBN checkpoint starts and ends. FSImage spyImage1 = NameNodeAdapter.spyOnFsImage(nn1); DelayAnswer answerer = new DelayAnswer(LOG); Mockito.doAnswer(answerer).when(spyImage1) .saveNamespace(Mockito.any(FSNamesystem.class), Mockito.any(Canceler.class)); // Perform some edits and wait for a checkpoint to start on the SBN. doEdits(0, 1000); nn0.getRpcServer().rollEditLog(); answerer.waitForCall(); answerer.proceed(); assertTrue("SBN is not performing checkpoint but it should be.", answerer.getFireCount() == 1 && answerer.getResultCount() == 0); // Make sure that the lock has actually been taken by the checkpointing // thread. ThreadUtil.sleepAtLeastIgnoreInterrupts(1000); try { // Perform an RPC to the SBN and make sure it throws a StandbyException. nn1.getRpcServer().getFileInfo("/"); fail("Should have thrown StandbyException, but instead succeeded."); } catch (StandbyException se) { GenericTestUtils.assertExceptionContains("is not supported", se); } // Make sure that the checkpoint is still going on, implying that the client // RPC to the SBN happened during the checkpoint. assertTrue("SBN should have still been checkpointing.", answerer.getFireCount() == 1 && answerer.getResultCount() == 0); answerer.waitForResult(); assertTrue("SBN should have finished checkpointing.", answerer.getFireCount() == 1 && answerer.getResultCount() == 1); }