@Test(timeout=20000) public void testCrashBetweenSyncLogAndPersistPaxosData() throws Exception { JournalFaultInjector faultInjector = JournalFaultInjector.instance = Mockito.mock(JournalFaultInjector.class); setupLoggers345(); // Run recovery where the client only talks to JN0, JN1, such that it // decides that the correct length is through txid 4. // Only allow it to call acceptRecovery() on JN0. qjm = createSpyingQJM(); spies = qjm.getLoggerSetForTests().getLoggersForTests(); cluster.getJournalNode(2).stopAndJoin(0); injectIOE().when(spies.get(1)).acceptRecovery( Mockito.<SegmentStateProto>any(), Mockito.<URL>any()); tryRecoveryExpectingFailure(); cluster.restartJournalNode(2); // State at this point: // JN0: edit log for 1-4, paxos recovery data for txid 4 // JN1: edit log for 1-4, // JN2: edit log for 1-5 // Run recovery again, but don't allow JN0 to respond to the // prepareRecovery() call. This will cause recovery to decide // on txid 5. // Additionally, crash all of the nodes before they persist // any new paxos data. qjm = createSpyingQJM(); spies = qjm.getLoggerSetForTests().getLoggersForTests(); injectIOE().when(spies.get(0)).prepareRecovery(Mockito.eq(1L)); Mockito.doThrow(new IOException("Injected")).when(faultInjector) .beforePersistPaxosData(); tryRecoveryExpectingFailure(); Mockito.reset(faultInjector); // State at this point: // JN0: edit log for 1-5, paxos recovery data for txid 4 // !!! This is the interesting bit, above. The on-disk data and the // paxos data don't match up! // JN1: edit log for 1-5, // JN2: edit log for 1-5, // Now, stop JN2, and see if we can still start up even though // JN0 is in a strange state where its log data is actually newer // than its accepted Paxos state. cluster.getJournalNode(2).stopAndJoin(0); qjm = createSpyingQJM(); try { long recovered = QJMTestUtil.recoverAndReturnLastTxn(qjm); assertTrue(recovered >= 4); // 4 was committed to a quorum } finally { qjm.close(); } }
@Test(timeout=20000) public void testCrashBetweenSyncLogAndPersistPaxosData() throws Exception { JournalFaultInjector faultInjector = JournalFaultInjector.instance = Mockito.mock(JournalFaultInjector.class); setupLoggers345(); // Run recovery where the client only talks to JN0, JN1, such that it // decides that the correct length is through txid 4. // Only allow it to call acceptRecovery() on JN0. qjm = createSpyingQJM(); spies = qjm.getLoggerSetForTests().getLoggersForTests(); cluster.getJournalNode(2).stopAndJoin(0); injectIOE().when(spies.get(1)).acceptRecovery( Mockito.<SegmentStateProto>any(), Mockito.<String>any()); tryRecoveryExpectingFailure(); cluster.restartJournalNode(2); // State at this point: // JN0: edit log for 1-4, paxos recovery data for txid 4 // JN1: edit log for 1-4, // JN2: edit log for 1-5 // Run recovery again, but don't allow JN0 to respond to the // prepareRecovery() call. This will cause recovery to decide // on txid 5. // Additionally, crash all of the nodes before they persist // any new paxos data. qjm = createSpyingQJM(); spies = qjm.getLoggerSetForTests().getLoggersForTests(); injectIOE().when(spies.get(0)).prepareRecovery(Mockito.eq(1L)); Mockito.doThrow(new IOException("Injected")).when(faultInjector) .beforePersistPaxosData(); tryRecoveryExpectingFailure(); Mockito.reset(faultInjector); // State at this point: // JN0: edit log for 1-5, paxos recovery data for txid 4 // !!! This is the interesting bit, above. The on-disk data and the // paxos data don't match up! // JN1: edit log for 1-5, // JN2: edit log for 1-5, // Now, stop JN2, and see if we can still start up even though // JN0 is in a strange state where its log data is actually newer // than its accepted Paxos state. cluster.getJournalNode(2).stopAndJoin(0); qjm = createSpyingQJM(); try { long recovered = QJMTestUtil.recoverAndReturnLastTxn(qjm); assertTrue(recovered >= 4); // 4 was committed to a quorum } finally { qjm.close(); } }