/** * This should be called by the member and should write a serialized root cause exception as * to the abort znode. */ @Override public void sendMemberAborted(Subprocedure sub, ForeignException ee) { if (sub == null) { LOG.error("Failed due to null subprocedure", ee); return; } String procName = sub.getName(); LOG.debug("Aborting procedure (" + procName + ") in zk"); String procAbortZNode = zkController.getAbortZNode(procName); try { String source = (ee.getSource() == null) ? memberName: ee.getSource(); byte[] errorInfo = ProtobufUtil.prependPBMagic(ForeignException.serialize(source, ee)); ZKUtil.createAndFailSilent(zkController.getWatcher(), procAbortZNode, errorInfo); LOG.debug("Finished creating abort znode:" + procAbortZNode); } catch (KeeperException e) { // possible that we get this error for the procedure if we already reset the zk state, but in // that case we should still get an error for that procedure anyways zkController.logZKTree(zkController.getBaseZnode()); member.controllerConnectionFailure("Failed to post zk node:" + procAbortZNode + " to abort procedure", e, procName); } }
/** * The connection to the rest of the procedure group (members and coordinator) has been * broken/lost/failed. This should fail any interested procedures, but not attempt to notify other * members since we cannot reach them anymore. * @param message description of the error * @param cause the actual cause of the failure */ void rpcConnectionFailure(final String message, final IOException cause) { Collection<Procedure> toNotify = procedures.values(); boolean isTraceEnabled = LOG.isTraceEnabled(); LOG.debug("received connection failure: " + message, cause); for (Procedure proc : toNotify) { if (proc == null) { continue; } // notify the elements, if they aren't null if (isTraceEnabled) { LOG.trace("connection failure - notify procedure: " + proc.getName()); } proc.receive(new ForeignException(proc.getName(), cause)); } }
/** * Wait for latch to count to zero, ignoring any spurious wake-ups, but waking periodically to * check for errors * @param latch latch to wait on * @param monitor monitor to check for errors while waiting * @param wakeFrequency frequency to wake up and check for errors (in * {@link TimeUnit#MILLISECONDS}) * @param latchDescription description of the latch, for logging * @throws ForeignException type of error the monitor can throw, if the task fails * @throws InterruptedException if we are interrupted while waiting on latch */ public static void waitForLatch(CountDownLatch latch, ForeignExceptionSnare monitor, long wakeFrequency, String latchDescription) throws ForeignException, InterruptedException { boolean released = false; while (!released) { if (monitor != null) { monitor.rethrowException(); } /* ForeignExceptionDispatcher.LOG.debug("Waiting for '" + latchDescription + "' latch. (sleep:" + wakeFrequency + " ms)"); */ released = latch.await(wakeFrequency, TimeUnit.MILLISECONDS); } // check error again in case an error raised during last wait if (monitor != null) { monitor.rethrowException(); } }
/** * This is the abort message being sent by the coordinator to member * * TODO this code isn't actually used but can be used to issue a cancellation from the * coordinator. */ @Override final public void sendAbortToMembers(Procedure proc, ForeignException ee) { String procName = proc.getName(); LOG.debug("Aborting procedure '" + procName + "' in zk"); String procAbortNode = zkProc.getAbortZNode(procName); try { LOG.debug("Creating abort znode:" + procAbortNode); String source = (ee.getSource() == null) ? coordName : ee.getSource(); byte[] errorInfo = ProtobufUtil.prependPBMagic(ForeignException.serialize(source, ee)); // first create the znode for the procedure ZKUtil.createAndFailSilent(zkProc.getWatcher(), procAbortNode, errorInfo); LOG.debug("Finished creating abort node:" + procAbortNode); } catch (KeeperException e) { // possible that we get this error for the procedure if we already reset the zk state, but in // that case we should still get an error for that procedure anyways zkProc.logZKTree(zkProc.baseZNode); coordinator.rpcConnectionFailure("Failed to post zk node:" + procAbortNode + " to abort procedure '" + procName + "'", new IOException(e)); } }
/** * Wait for the coordinator task to complete, and verify all the mocks * @param task to wait on * @throws Exception on unexpected failure */ private void waitAndVerifyProc(Procedure proc, VerificationMode prepare, VerificationMode commit, VerificationMode cleanup, VerificationMode finish, boolean opHasError) throws Exception { boolean caughtError = false; try { proc.waitForCompleted(); } catch (ForeignException fe) { caughtError = true; } // make sure that the task called all the expected phases Mockito.verify(proc, prepare).sendGlobalBarrierStart(); Mockito.verify(proc, commit).sendGlobalBarrierReached(); Mockito.verify(proc, finish).sendGlobalBarrierComplete(); assertEquals("Operation error state was unexpected", opHasError, proc.getErrorMonitor() .hasException()); assertEquals("Operation error state was unexpected", opHasError, caughtError); }
/** * Wait for the coordinator task to complete, and verify all the mocks * @param task to wait on * @throws Exception on unexpected failure */ private void waitAndVerifySubproc(Subprocedure op, VerificationMode prepare, VerificationMode commit, VerificationMode cleanup, VerificationMode finish, boolean opHasError) throws Exception { boolean caughtError = false; try { op.waitForLocallyCompleted(); } catch (ForeignException fe) { caughtError = true; } // make sure that the task called all the expected phases Mockito.verify(op, prepare).acquireBarrier(); Mockito.verify(op, commit).insideBarrier(); // We cannot guarantee that cleanup has run so we don't check it. assertEquals("Operation error state was unexpected", opHasError, op.getErrorCheckable() .hasException()); assertEquals("Operation error state was unexpected", opHasError, caughtError); }
@Test(timeout = 60000) public void testErrorPropagation() throws Exception { List<String> members = new ArrayList<String>(); members.add("member"); Procedure proc = new Procedure(coord, new ForeignExceptionDispatcher(), 100, Integer.MAX_VALUE, "op", null, members); final Procedure procspy = spy(proc); ForeignException cause = new ForeignException("SRC", "External Exception"); proc.receive(cause); // start the barrier procedure Thread t = new Thread() { public void run() { procspy.call(); } }; t.start(); t.join(); verify(procspy, never()).sendGlobalBarrierStart(); verify(procspy, never()).sendGlobalBarrierReached(); verify(procspy).sendGlobalBarrierComplete(); }
/** * This should be called by the member and should write a serialized root cause exception as * to the abort znode. */ @Override public void sendMemberAborted(Subprocedure sub, ForeignException ee) { if (sub == null) { LOG.error("Failed due to null subprocedure", ee); return; } String procName = sub.getName(); LOG.debug("Aborting procedure (" + procName + ") in zk"); String procAbortZNode = zkController.getAbortZNode(procName); try { String source = (ee.getSource() == null) ? memberName: ee.getSource(); byte[] errorInfo = ProtobufUtil.prependPBMagic(ForeignException.serialize(source, ee)); ZKUtil.createAndFailSilent(zkController.getWatcher(), procAbortZNode, errorInfo); LOG.debug("Finished creating abort znode:" + procAbortZNode); } catch (KeeperException e) { // possible that we get this error for the procedure if we already reset the zk state, but in // that case we should still get an error for that procedure anyways zkController.logZKTree(zkController.getBaseZnode()); member.controllerConnectionFailure("Failed to post zk node:" + procAbortZNode + " to abort procedure", new IOException(e)); } }
/**
 * Verifies that a failure reported by a running task reaches the error listener exactly once.
 */
@Test
public void testErrorPropagation() throws Exception {
  ForeignExceptionDispatcher error = mock(ForeignExceptionDispatcher.class);

  SnapshotDescription.Builder builder = SnapshotDescription.newBuilder();
  builder = builder.setName("snapshot");
  builder = builder.setTable("table");
  SnapshotDescription snapshot = builder.build();

  final Exception thrown = new Exception("Failed!");
  // a task whose only work is to report an injected failure
  SnapshotTask failingTask = new SnapshotTask(snapshot, error) {
    @Override
    public Void call() {
      snapshotFailure("Injected failure", thrown);
      return null;
    }
  };
  failingTask.call();

  verify(error, Mockito.times(1)).receive(any(ForeignException.class));
}
/**
 * Cancels the restore, unless it has already been stopped: marks it stopped, logs the reason
 * and hands the monitor a {@link ForeignException} that wraps a
 * {@link CancellationException} and names the master's server name as its source.
 * @param why reason for the cancellation
 */
@Override
public void cancel(String why) {
  if (!this.stopped) {
    this.stopped = true;
    LOG.info("Stopping restore snapshot=" + ClientSnapshotDescriptionUtils.toString(snapshot)
        + " because: " + why);
    final CancellationException cancellation = new CancellationException(why);
    final String source = masterServices.getServerName().toString();
    this.monitor.receive(new ForeignException(source, cancellation));
  }
}
/**
 * Cancels the clone, unless it has already been stopped: marks it stopped, logs the reason,
 * aborts the status with the same message and hands the monitor a {@link ForeignException}
 * wrapping a {@link CancellationException}.
 * @param why reason for the cancellation
 */
@Override
public void cancel(String why) {
  if (!this.stopped) {
    this.stopped = true;
    final String reason = "Stopping clone snapshot=" + snapshot + " because: " + why;
    LOG.info(reason);
    status.abort(reason);
    final CancellationException cancellation = new CancellationException(why);
    this.monitor.receive(new ForeignException(NAME, cancellation));
  }
}
/**
 * Cancels taking the snapshot, unless it has already finished: marks it finished, logs the
 * reason and hands the monitor a {@link ForeignException} that wraps a
 * {@link CancellationException} and names the master's server name as its source.
 * @param why reason for the cancellation
 */
@Override
public void cancel(String why) {
  if (!finished) {
    this.finished = true;
    final String description = ClientSnapshotDescriptionUtils.toString(snapshot);
    LOG.info("Stop taking snapshot=" + description + " because: " + why);
    final CancellationException cancellation = new CancellationException(why);
    final String source = master.getServerName().toString();
    monitor.receive(new ForeignException(source, cancellation));
  }
}
/**
 * Cancels the subprocedure by injecting an exception from an external source. The failure is
 * logged, the subprocedure is marked complete, and the monitor receives the cause.
 * @param msg message to log with the failure
 * @param cause the reason for cancelling; passed to the monitor as-is when it already is a
 *          {@link ForeignException}, otherwise wrapped in one with this member's name as source
 */
public void cancel(String msg, Throwable cause) {
  LOG.error(msg, cause);
  complete = true;

  final ForeignException toReport;
  if (cause instanceof ForeignException) {
    toReport = (ForeignException) cause;
  } else {
    toReport = new ForeignException(getMemberName(), cause);
  }
  monitor.receive(toReport);
}
/** * Send abort to the specified procedure * @param procName name of the procedure to about * @param ee exception information about the abort */ public void receiveAbortProcedure(String procName, ForeignException ee) { LOG.debug("Request received to abort procedure " + procName, ee); // if we know about the procedure, notify it Subprocedure sub = subprocs.get(procName); if (sub == null) { LOG.info("Received abort on procedure with no local subprocedure " + procName + ", ignoring it.", ee); return; // Procedure has already completed } String msg = "Propagating foreign exception to subprocedure " + sub.getName(); LOG.error(msg, ee); sub.cancel(msg, ee); }
private void flushRegions() throws ForeignException { if (regions.isEmpty()) { // No regions on this RS, we are basically done. return; } monitor.rethrowException(); // assert that the taskManager is empty. if (taskManager.hasTasks()) { throw new IllegalStateException("Attempting to flush " + table + " but we currently have outstanding tasks"); } // Add all hfiles already existing in region. for (Region region : regions) { // submit one task per region for parallelize by region. taskManager.submitTask(new RegionFlushTask(region)); monitor.rethrowException(); } // wait for everything to complete. LOG.debug("Flush region tasks submitted for " + regions.size() + " regions"); try { taskManager.waitForOutstandingTasks(); } catch (InterruptedException e) { throw new ForeignException(getMemberName(), e); } }
/** * Abort the procedure with the given name * @param procName name of the procedure to abort * @param reason serialized information about the abort */ public void abortProcedure(String procName, ForeignException reason) { LOG.debug("abort procedure " + procName, reason); // if we know about the Procedure, notify it Procedure proc = procedures.get(procName); if (proc == null) { return; } proc.receive(reason); }
/** * Sends a message to all members that the global barrier condition has been satisfied. This * should only be executed after all members have completed its * {@link Subprocedure#acquireBarrier()} call successfully. This triggers the member * {@link Subprocedure#insideBarrier} method. * @throws ForeignException */ public void sendGlobalBarrierReached() throws ForeignException { try { // trigger to have member run {@link Subprocedure#insideBarrier} coord.getRpcs().sendGlobalBarrierReached(this, Lists.newArrayList(inBarrierMembers)); } catch (IOException e) { coord.rpcConnectionFailure("Can't reach controller.", e); } }
private void flushSnapshot() throws ForeignException { if (regions.isEmpty()) { // No regions on this RS, we are basically done. return; } monitor.rethrowException(); // assert that the taskManager is empty. if (taskManager.hasTasks()) { throw new IllegalStateException("Attempting to take snapshot " + ClientSnapshotDescriptionUtils.toString(snapshot) + " but we currently have outstanding tasks"); } // Add all hfiles already existing in region. for (Region region : regions) { // submit one task per region for parallelize by region. taskManager.submitTask(new RegionSnapshotTask(region)); monitor.rethrowException(); } // wait for everything to complete. LOG.debug("Flush Snapshot Tasks submitted for " + regions.size() + " regions"); try { taskManager.waitForOutstandingTasks(); } catch (InterruptedException e) { LOG.error("got interrupted exception for " + getMemberName()); throw new ForeignException(getMemberName(), e); } }
/** * Test the normal sub procedure execution case. */ @Test(timeout = 500) public void testSimpleRun() throws Exception { member = buildCohortMember(); EmptySubprocedure subproc = new EmptySubprocedure(member, mockListener); EmptySubprocedure spy = spy(subproc); when(mockBuilder.buildSubprocedure(op, data)).thenReturn(spy); // when we get a prepare, then start the commit phase addCommitAnswer(); // run the operation // build a new operation Subprocedure subproc1 = member.createSubprocedure(op, data); member.submitSubprocedure(subproc1); // and wait for it to finish subproc.waitForLocallyCompleted(); // make sure everything ran in order InOrder order = inOrder(mockMemberComms, spy); order.verify(spy).acquireBarrier(); order.verify(mockMemberComms).sendMemberAcquired(eq(spy)); order.verify(spy).insideBarrier(); order.verify(mockMemberComms).sendMemberCompleted(eq(spy), eq(data)); order.verify(mockMemberComms, never()).sendMemberAborted(eq(spy), any(ForeignException.class)); }
/** * Check handling a connection failure correctly if we get it during the acquiring phase */ @Test(timeout = 60000) public void testUnreachableControllerDuringPrepare() throws Exception { coordinator = buildNewCoordinator(); // setup the proc List<String> expected = Arrays.asList("cohort"); Procedure proc = new Procedure(coordinator, WAKE_FREQUENCY, TIMEOUT, procName, procData, expected); final Procedure procSpy = spy(proc); when(coordinator.createProcedure(any(ForeignExceptionDispatcher.class), eq(procName), eq(procData), anyListOf(String.class))) .thenReturn(procSpy); // use the passed controller responses IOException cause = new IOException("Failed to reach comms during acquire"); doThrow(cause).when(controller) .sendGlobalBarrierAcquire(eq(procSpy), eq(procData), anyListOf(String.class)); // run the operation proc = coordinator.startProcedure(proc.getErrorMonitor(), procName, procData, expected); // and wait for it to finish while(!proc.completedLatch.await(WAKE_FREQUENCY, TimeUnit.MILLISECONDS)); verify(procSpy, atLeastOnce()).receive(any(ForeignException.class)); verify(coordinator, times(1)).rpcConnectionFailure(anyString(), eq(cause)); verify(controller, times(1)).sendGlobalBarrierAcquire(procSpy, procData, expected); verify(controller, never()).sendGlobalBarrierReached(any(Procedure.class), anyListOf(String.class)); }
/** * Check handling a connection failure correctly if we get it during the barrier phase */ @Test(timeout = 60000) public void testUnreachableControllerDuringCommit() throws Exception { coordinator = buildNewCoordinator(); // setup the task and spy on it List<String> expected = Arrays.asList("cohort"); final Procedure spy = spy(new Procedure(coordinator, WAKE_FREQUENCY, TIMEOUT, procName, procData, expected)); when(coordinator.createProcedure(any(ForeignExceptionDispatcher.class), eq(procName), eq(procData), anyListOf(String.class))) .thenReturn(spy); // use the passed controller responses IOException cause = new IOException("Failed to reach controller during prepare"); doAnswer(new AcquireBarrierAnswer(procName, new String[] { "cohort" })) .when(controller).sendGlobalBarrierAcquire(eq(spy), eq(procData), anyListOf(String.class)); doThrow(cause).when(controller).sendGlobalBarrierReached(eq(spy), anyListOf(String.class)); // run the operation Procedure task = coordinator.startProcedure(spy.getErrorMonitor(), procName, procData, expected); // and wait for it to finish while(!task.completedLatch.await(WAKE_FREQUENCY, TimeUnit.MILLISECONDS)); verify(spy, atLeastOnce()).receive(any(ForeignException.class)); verify(coordinator, times(1)).rpcConnectionFailure(anyString(), eq(cause)); verify(controller, times(1)).sendGlobalBarrierAcquire(eq(spy), eq(procData), anyListOf(String.class)); verify(controller, times(1)).sendGlobalBarrierReached(any(Procedure.class), anyListOf(String.class)); }
@Test(timeout = 60000) public void testBarrieredErrorPropagation() throws Exception { List<String> members = new ArrayList<String>(); members.add("member"); LatchedProcedure proc = new LatchedProcedure(coord, new ForeignExceptionDispatcher(), 100, Integer.MAX_VALUE, "op", null, members); final LatchedProcedure procspy = spy(proc); // start the barrier procedure Thread t = new Thread() { public void run() { procspy.call(); } }; t.start(); // now test that we can put an error in before the commit phase runs procspy.startedAcquireBarrier.await(); ForeignException cause = new ForeignException("SRC", "External Exception"); procspy.receive(cause); procspy.barrierAcquiredByMember(members.get(0)); t.join(); // verify state of all the object verify(procspy).sendGlobalBarrierStart(); verify(procspy).sendGlobalBarrierComplete(); verify(procspy, never()).sendGlobalBarrierReached(); }
/**
 * Cancels the restore, unless it has already been stopped: marks it stopped, logs the reason
 * and hands the monitor a {@link ForeignException} that wraps a
 * {@link CancellationException} and names the master's server name as its source.
 * @param why reason for the cancellation
 */
@Override
public void cancel(String why) {
  if (!this.stopped) {
    this.stopped = true;
    LOG.info("Stopping restore snapshot=" + SnapshotDescriptionUtils.toString(snapshot)
        + " because: " + why);
    final CancellationException cancellation = new CancellationException(why);
    final String source = masterServices.getServerName().toString();
    this.monitor.receive(new ForeignException(source, cancellation));
  }
}
/** * The connection to the rest of the procedure group (members and coordinator) has been * broken/lost/failed. This should fail any interested procedures, but not attempt to notify other * members since we cannot reach them anymore. * @param message description of the error * @param cause the actual cause of the failure */ void rpcConnectionFailure(final String message, final IOException cause) { Collection<Procedure> toNotify = procedures.values(); for (Procedure proc : toNotify) { if (proc == null) { continue; } // notify the elements, if they aren't null proc.receive(new ForeignException(proc.getName(), cause)); } }
/**
 * Cancels the subprocedure by injecting an exception from an external source. The failure is
 * logged and the monitor receives the cause.
 * @param msg message to log with the failure
 * @param cause the reason for cancelling; passed to the monitor as-is when it already is a
 *          {@link ForeignException}, otherwise wrapped in one with this member's name as source
 */
public void cancel(String msg, Throwable cause) {
  LOG.error(msg, cause);

  final ForeignException toReport;
  if (cause instanceof ForeignException) {
    toReport = (ForeignException) cause;
  } else {
    toReport = new ForeignException(getMemberName(), cause);
  }
  monitor.receive(toReport);
}
/** * Abort the procedure with the given name * @param procName name of the procedure to abort * @param reason serialized information about the abort */ public void abortProcedure(String procName, ForeignException reason) { // if we know about the Procedure, notify it synchronized(procedures) { Procedure proc = procedures.get(procName); if (proc == null) { return; } proc.receive(reason); } }