private void testMRAppHistory(MRApp app) throws Exception { Configuration conf = new Configuration(); Job job = app.submit(conf); app.waitForState(job, JobState.FAILED); Map<TaskId, Task> tasks = job.getTasks(); Assert.assertEquals("Num tasks is not correct", 1, tasks.size()); Task task = tasks.values().iterator().next(); Assert.assertEquals("Task state not correct", TaskState.FAILED, task .getReport().getTaskState()); Map<TaskAttemptId, TaskAttempt> attempts = tasks.values().iterator().next() .getAttempts(); Assert.assertEquals("Num attempts is not correct", 4, attempts.size()); Iterator<TaskAttempt> it = attempts.values().iterator(); TaskAttemptReport report = it.next().getReport(); Assert.assertEquals("Attempt state not correct", TaskAttemptState.FAILED, report.getTaskAttemptState()); Assert.assertEquals("Diagnostic Information is not Correct", "Test Diagnostic Event", report.getDiagnosticInfo()); report = it.next().getReport(); Assert.assertEquals("Attempt state not correct", TaskAttemptState.FAILED, report.getTaskAttemptState()); }
@Test public void testKillDuringTaskAttemptCommit() { mockTask = createMockTask(TaskType.REDUCE); TaskId taskId = getNewTaskID(); scheduleTaskAttempt(taskId); launchTaskAttempt(getLastAttempt().getAttemptId()); updateLastAttemptState(TaskAttemptState.COMMIT_PENDING); commitTaskAttempt(getLastAttempt().getAttemptId()); TaskAttemptId commitAttempt = getLastAttempt().getAttemptId(); updateLastAttemptState(TaskAttemptState.KILLED); killRunningTaskAttempt(commitAttempt); assertFalse(mockTask.canCommit(commitAttempt)); }
public void waitForState(TaskAttempt attempt, TaskAttemptState finalState) throws Exception { int timeoutSecs = 0; TaskAttemptReport report = attempt.getReport(); while (!finalState.equals(report.getTaskAttemptState()) && timeoutSecs++ < 20) { System.out.println("TaskAttempt State is : " + report.getTaskAttemptState() + " Waiting for state : " + finalState + " progress : " + report.getProgress()); report = attempt.getReport(); Thread.sleep(500); } System.out.println("TaskAttempt State is : " + report.getTaskAttemptState()); Assert.assertEquals("TaskAttempt state is not correct (timedout)", finalState, report.getTaskAttemptState()); }
@Test //First attempt is failed and second attempt is passed //The job succeeds. public void testFailTask() throws Exception { MRApp app = new MockFirstFailingAttemptMRApp(1, 0); Configuration conf = new Configuration(); // this test requires two task attempts, but uberization overrides max to 1 conf.setBoolean(MRJobConfig.JOB_UBERTASK_ENABLE, false); Job job = app.submit(conf); app.waitForState(job, JobState.SUCCEEDED); Map<TaskId,Task> tasks = job.getTasks(); Assert.assertEquals("Num tasks is not correct", 1, tasks.size()); Task task = tasks.values().iterator().next(); Assert.assertEquals("Task state not correct", TaskState.SUCCEEDED, task.getReport().getTaskState()); Map<TaskAttemptId, TaskAttempt> attempts = tasks.values().iterator().next().getAttempts(); Assert.assertEquals("Num attempts is not correct", 2, attempts.size()); //one attempt must be failed //and another must have succeeded Iterator<TaskAttempt> it = attempts.values().iterator(); Assert.assertEquals("Attempt state not correct", TaskAttemptState.FAILED, it.next().getReport().getTaskAttemptState()); Assert.assertEquals("Attempt state not correct", TaskAttemptState.SUCCEEDED, it.next().getReport().getTaskAttemptState()); }
@Test //All Task attempts are timed out, leading to Job failure public void testTimedOutTask() throws Exception { MRApp app = new TimeOutTaskMRApp(1, 0); Configuration conf = new Configuration(); int maxAttempts = 2; conf.setInt(MRJobConfig.MAP_MAX_ATTEMPTS, maxAttempts); // disable uberization (requires entire job to be reattempted, so max for // subtask attempts is overridden to 1) conf.setBoolean(MRJobConfig.JOB_UBERTASK_ENABLE, false); Job job = app.submit(conf); app.waitForState(job, JobState.FAILED); Map<TaskId,Task> tasks = job.getTasks(); Assert.assertEquals("Num tasks is not correct", 1, tasks.size()); Task task = tasks.values().iterator().next(); Assert.assertEquals("Task state not correct", TaskState.FAILED, task.getReport().getTaskState()); Map<TaskAttemptId, TaskAttempt> attempts = tasks.values().iterator().next().getAttempts(); Assert.assertEquals("Num attempts is not correct", maxAttempts, attempts.size()); for (TaskAttempt attempt : attempts.values()) { Assert.assertEquals("Attempt state not correct", TaskAttemptState.FAILED, attempt.getReport().getTaskAttemptState()); } }
public static TaskAttemptState toYarn( org.apache.hadoop.mapred.TaskStatus.State state) { switch (state) { case COMMIT_PENDING: return TaskAttemptState.COMMIT_PENDING; case FAILED: case FAILED_UNCLEAN: return TaskAttemptState.FAILED; case KILLED: case KILLED_UNCLEAN: return TaskAttemptState.KILLED; case RUNNING: return TaskAttemptState.RUNNING; case SUCCEEDED: return TaskAttemptState.SUCCEEDED; case UNASSIGNED: return TaskAttemptState.STARTING; default: throw new YarnRuntimeException("Unrecognized State: " + state); } }
@Test public void testTimeoutWhileSuccessFinishing() throws Exception { MockEventHandler eventHandler = new MockEventHandler(); TaskAttemptImpl taImpl = createTaskAttemptImpl(eventHandler); taImpl.handle(new TaskAttemptEvent(taImpl.getID(), TaskAttemptEventType.TA_DONE)); assertEquals("Task attempt is not in RUNNING state", taImpl.getState(), TaskAttemptState.SUCCEEDED); assertEquals("Task attempt's internal state is not " + "SUCCESS_FINISHING_CONTAINER", taImpl.getInternalState(), TaskAttemptStateInternal.SUCCESS_FINISHING_CONTAINER); // If the task stays in SUCCESS_FINISHING_CONTAINER for too long, // TaskAttemptListenerImpl will time out the attempt. taImpl.handle(new TaskAttemptEvent(taImpl.getID(), TaskAttemptEventType.TA_TIMED_OUT)); assertEquals("Task attempt is not in RUNNING state", taImpl.getState(), TaskAttemptState.SUCCEEDED); assertEquals("Task attempt's internal state is not " + "SUCCESS_CONTAINER_CLEANUP", taImpl.getInternalState(), TaskAttemptStateInternal.SUCCESS_CONTAINER_CLEANUP); assertFalse("InternalError occurred", eventHandler.internalError); }
@Test public void testTimeoutWhileFailFinishing() throws Exception { MockEventHandler eventHandler = new MockEventHandler(); TaskAttemptImpl taImpl = createTaskAttemptImpl(eventHandler); taImpl.handle(new TaskAttemptEvent(taImpl.getID(), TaskAttemptEventType.TA_FAILMSG)); assertEquals("Task attempt is not in RUNNING state", taImpl.getState(), TaskAttemptState.FAILED); assertEquals("Task attempt's internal state is not " + "FAIL_FINISHING_CONTAINER", taImpl.getInternalState(), TaskAttemptStateInternal.FAIL_FINISHING_CONTAINER); // If the task stays in FAIL_FINISHING_CONTAINER for too long, // TaskAttemptListenerImpl will time out the attempt. taImpl.handle(new TaskAttemptEvent(taImpl.getID(), TaskAttemptEventType.TA_TIMED_OUT)); assertEquals("Task attempt's internal state is not FAIL_CONTAINER_CLEANUP", taImpl.getInternalState(), TaskAttemptStateInternal.FAIL_CONTAINER_CLEANUP); assertFalse("InternalError occurred", eventHandler.internalError); }
/** * Absorbs one TaskAttemptStatus * * @param reportedStatus the status report that we got from a task attempt * that we want to fold into the speculation data for this job * @param timestamp the time this status corresponds to. This matters * because statuses contain progress. */ protected void statusUpdate(TaskAttemptStatus reportedStatus, long timestamp) { String stateString = reportedStatus.taskState.toString(); TaskAttemptId attemptID = reportedStatus.id; TaskId taskID = attemptID.getTaskId(); Job job = context.getJob(taskID.getJobId()); if (job == null) { return; } Task task = job.getTask(taskID); if (task == null) { return; } estimator.updateAttempt(reportedStatus, timestamp); if (stateString.equals(TaskAttemptState.RUNNING.name())) { runningTasks.putIfAbsent(taskID, Boolean.TRUE); } else { runningTasks.remove(taskID, Boolean.TRUE); if (!stateString.equals(TaskAttemptState.STARTING.name())) { runningTaskAttemptStatistics.remove(attemptID); } } }
private TaskAttempt getSuccessfulAttempt(Task task) { for (TaskAttempt attempt : task.getAttempts().values()) { if (attempt.getState() == TaskAttemptState.SUCCEEDED) { return attempt; } } return null; }
@Override public TaskAttemptState getState() { readLock.lock(); try { return getExternalState(stateMachine.getCurrentState()); } finally { readLock.unlock(); } }
private static TaskAttemptState getExternalState( TaskAttemptStateInternal smState) { switch (smState) { case ASSIGNED: case UNASSIGNED: return TaskAttemptState.STARTING; case COMMIT_PENDING: return TaskAttemptState.COMMIT_PENDING; case FAILED: return TaskAttemptState.FAILED; case KILLED: return TaskAttemptState.KILLED; // All CLEANUP states considered as RUNNING since events have not gone out // to the Task yet. May be possible to consider them as a Finished state. case FAIL_CONTAINER_CLEANUP: case FAIL_TASK_CLEANUP: case KILL_CONTAINER_CLEANUP: case KILL_TASK_CLEANUP: case SUCCESS_CONTAINER_CLEANUP: case RUNNING: return TaskAttemptState.RUNNING; case NEW: return TaskAttemptState.NEW; case SUCCEEDED: return TaskAttemptState.SUCCEEDED; default: throw new YarnRuntimeException("Attempt to convert invalid " + "stateMachineTaskAttemptState to externalTaskAttemptState: " + smState); } }
private void initTaskAttemptStatus(TaskAttemptStatus result) { result.progress = 0.0f; result.phase = Phase.STARTING; result.stateString = "NEW"; result.taskState = TaskAttemptState.NEW; Counters counters = EMPTY_COUNTERS; result.counters = counters; }
private void verifyTaskAttemptReport(TaskAttemptReport tar) { Assert.assertEquals(TaskAttemptState.RUNNING, tar.getTaskAttemptState()); Assert.assertNotNull("TaskAttemptReport is null", tar); Assert.assertEquals(MRApp.NM_HOST, tar.getNodeManagerHost()); Assert.assertEquals(MRApp.NM_PORT, tar.getNodeManagerPort()); Assert.assertEquals(MRApp.NM_HTTP_PORT, tar.getNodeManagerHttpPort()); Assert.assertEquals(1, tar.getContainerId().getApplicationAttemptId() .getAttemptId()); }
@Override public boolean isFinished() { for (TaskAttempt attempt : attempts.values()) { if (attempt.getState() == TaskAttemptState.SUCCEEDED) { return true; } } return false; }
@Override public float getProgress() { if (overridingState == TaskAttemptState.NEW) { return 0.0F; } return myAttemptID.getTaskId().getTaskType() == TaskType.MAP ? getMapProgress() : getReduceProgress(); }
private TaskAttemptInfo getMockTaskAttemptInfo(TaskAttemptID tai, TaskAttemptState tas) { ContainerId ci = mock(ContainerId.class); Counters counters = mock(Counters.class); TaskType tt = TaskType.MAP; long finishTime = System.currentTimeMillis(); TaskAttemptInfo mockTAinfo = mock(TaskAttemptInfo.class); when(mockTAinfo.getAttemptId()).thenReturn(tai); when(mockTAinfo.getContainerId()).thenReturn(ci); when(mockTAinfo.getCounters()).thenReturn(counters); when(mockTAinfo.getError()).thenReturn(""); when(mockTAinfo.getFinishTime()).thenReturn(finishTime); when(mockTAinfo.getHostname()).thenReturn("localhost"); when(mockTAinfo.getHttpPort()).thenReturn(23); when(mockTAinfo.getMapFinishTime()).thenReturn(finishTime - 1000L); when(mockTAinfo.getPort()).thenReturn(24); when(mockTAinfo.getRackname()).thenReturn("defaultRack"); when(mockTAinfo.getShuffleFinishTime()).thenReturn(finishTime - 2000L); when(mockTAinfo.getShufflePort()).thenReturn(25); when(mockTAinfo.getSortFinishTime()).thenReturn(finishTime - 3000L); when(mockTAinfo.getStartTime()).thenReturn(finishTime -10000); when(mockTAinfo.getState()).thenReturn("task in progress"); when(mockTAinfo.getTaskStatus()).thenReturn(tas.toString()); when(mockTAinfo.getTaskType()).thenReturn(tt); when(mockTAinfo.getTrackerName()).thenReturn("TrackerName"); return mockTAinfo; }
@Test public void testKillJob() throws Exception { final CountDownLatch latch = new CountDownLatch(1); MRApp app = new BlockingMRApp(1, 0, latch); //this will start the job but job won't complete as task is //blocked Job job = app.submit(new Configuration()); //wait and vailidate for Job to become RUNNING app.waitForState(job, JobState.RUNNING); //send the kill signal to Job app.getContext().getEventHandler().handle( new JobEvent(job.getID(), JobEventType.JOB_KILL)); //unblock Task latch.countDown(); //wait and validate for Job to be KILLED app.waitForState(job, JobState.KILLED); Map<TaskId,Task> tasks = job.getTasks(); Assert.assertEquals("No of tasks is not correct", 1, tasks.size()); Task task = tasks.values().iterator().next(); Assert.assertEquals("Task state not correct", TaskState.KILLED, task.getReport().getTaskState()); Map<TaskAttemptId, TaskAttempt> attempts = tasks.values().iterator().next().getAttempts(); Assert.assertEquals("No of attempts is not correct", 1, attempts.size()); Iterator<TaskAttempt> it = attempts.values().iterator(); Assert.assertEquals("Attempt state not correct", TaskAttemptState.KILLED, it.next().getReport().getTaskAttemptState()); }
@Test public void testCommitPending() throws Exception { MRApp app = new MRApp(1, 0, false, this.getClass().getName(), true); Job job = app.submit(new Configuration()); app.waitForState(job, JobState.RUNNING); Assert.assertEquals("Num tasks not correct", 1, job.getTasks().size()); Iterator<Task> it = job.getTasks().values().iterator(); Task task = it.next(); app.waitForState(task, TaskState.RUNNING); TaskAttempt attempt = task.getAttempts().values().iterator().next(); app.waitForState(attempt, TaskAttemptState.RUNNING); //send the commit pending signal to the task app.getContext().getEventHandler().handle( new TaskAttemptEvent( attempt.getID(), TaskAttemptEventType.TA_COMMIT_PENDING)); //wait for first attempt to commit pending app.waitForState(attempt, TaskAttemptState.COMMIT_PENDING); //re-send the commit pending signal to the task app.getContext().getEventHandler().handle( new TaskAttemptEvent( attempt.getID(), TaskAttemptEventType.TA_COMMIT_PENDING)); //the task attempt should be still at COMMIT_PENDING app.waitForState(attempt, TaskAttemptState.COMMIT_PENDING); //send the done signal to the task app.getContext().getEventHandler().handle( new TaskAttemptEvent( task.getAttempts().values().iterator().next().getID(), TaskAttemptEventType.TA_DONE)); app.waitForState(job, JobState.SUCCEEDED); }
@Test public void testTaskProgress() { LOG.info("--- START: testTaskProgress ---"); mockTask = createMockTask(TaskType.MAP); // launch task TaskId taskId = getNewTaskID(); scheduleTaskAttempt(taskId); float progress = 0f; assert(mockTask.getProgress() == progress); launchTaskAttempt(getLastAttempt().getAttemptId()); // update attempt1 progress = 50f; updateLastAttemptProgress(progress); assert(mockTask.getProgress() == progress); progress = 100f; updateLastAttemptProgress(progress); assert(mockTask.getProgress() == progress); progress = 0f; // mark first attempt as killed updateLastAttemptState(TaskAttemptState.KILLED); assert(mockTask.getProgress() == progress); // kill first attempt // should trigger a new attempt // as no successful attempts killRunningTaskAttempt(getLastAttempt().getAttemptId()); assert(taskAttempts.size() == 2); assert(mockTask.getProgress() == 0f); launchTaskAttempt(getLastAttempt().getAttemptId()); progress = 50f; updateLastAttemptProgress(progress); assert(mockTask.getProgress() == progress); }
@Test public void testFailureDuringTaskAttemptCommit() { mockTask = createMockTask(TaskType.MAP); TaskId taskId = getNewTaskID(); scheduleTaskAttempt(taskId); launchTaskAttempt(getLastAttempt().getAttemptId()); updateLastAttemptState(TaskAttemptState.COMMIT_PENDING); commitTaskAttempt(getLastAttempt().getAttemptId()); // During the task attempt commit there is an exception which causes // the attempt to fail updateLastAttemptState(TaskAttemptState.FAILED); failRunningTaskAttempt(getLastAttempt().getAttemptId()); assertEquals(2, taskAttempts.size()); updateLastAttemptState(TaskAttemptState.SUCCEEDED); commitTaskAttempt(getLastAttempt().getAttemptId()); mockTask.handle(new TaskTAttemptEvent(getLastAttempt().getAttemptId(), TaskEventType.T_ATTEMPT_SUCCEEDED)); assertFalse("First attempt should not commit", mockTask.canCommit(taskAttempts.get(0).getAttemptId())); assertTrue("Second attempt should commit", mockTask.canCommit(getLastAttempt().getAttemptId())); assertTaskSucceededState(); }
private void runSpeculativeTaskAttemptSucceeds( TaskEventType firstAttemptFinishEvent) { TaskId taskId = getNewTaskID(); scheduleTaskAttempt(taskId); launchTaskAttempt(getLastAttempt().getAttemptId()); updateLastAttemptState(TaskAttemptState.RUNNING); // Add a speculative task attempt that succeeds mockTask.handle(new TaskTAttemptEvent(getLastAttempt().getAttemptId(), TaskEventType.T_ADD_SPEC_ATTEMPT)); launchTaskAttempt(getLastAttempt().getAttemptId()); commitTaskAttempt(getLastAttempt().getAttemptId()); mockTask.handle(new TaskTAttemptEvent(getLastAttempt().getAttemptId(), TaskEventType.T_ATTEMPT_SUCCEEDED)); // The task should now have succeeded assertTaskSucceededState(); // Now complete the first task attempt, after the second has succeeded mockTask.handle(new TaskTAttemptEvent(taskAttempts.get(0).getAttemptId(), firstAttemptFinishEvent)); // The task should still be in the succeeded state assertTaskSucceededState(); // The task should contain speculative a task attempt assertTaskAttemptAvataar(Avataar.SPECULATIVE); }
private TaskAttemptStatus createTaskAttemptStatus(TaskAttemptId id, float progress, TaskAttemptState state) { TaskAttemptStatus status = new TaskAttemptStatus(); status.id = id; status.progress = progress; status.taskState = state; return status; }
@Override public TaskAttemptState getTaskAttemptState() { TaskAttemptReportProtoOrBuilder p = viaProto ? proto : builder; if (!p.hasTaskAttemptState()) { return null; } return convertFromProtoFormat(p.getTaskAttemptState()); }
@Override public void setTaskAttemptState(TaskAttemptState taskAttemptState) { maybeInitBuilder(); if (taskAttemptState == null) { builder.clearTaskAttemptState(); return; } builder.setTaskAttemptState(convertToProtoFormat(taskAttemptState)); }
CompletedTaskAttempt(TaskId taskId, TaskAttemptInfo attemptInfo) { this.attemptInfo = attemptInfo; this.attemptId = TypeConverter.toYarn(attemptInfo.getAttemptId()); if (attemptInfo.getTaskStatus() != null) { this.state = TaskAttemptState.valueOf(attemptInfo.getTaskStatus()); } else { this.state = TaskAttemptState.KILLED; localDiagMessage = "Attmpt state missing from History : marked as KILLED"; diagnostics.add(localDiagMessage); } if (attemptInfo.getError() != null) { diagnostics.add(attemptInfo.getError()); } }
private void verifyAttempt(TaskAttempt attempt) { Assert.assertEquals("TaskAttempt state not currect", TaskAttemptState.SUCCEEDED, attempt.getState()); Assert.assertNotNull(attempt.getAssignedContainerID()); //Verify the wrong ctor is not being used. Remove after mrv1 is removed. ContainerId fakeCid = MRApp.newContainerId(-1, -1, -1, -1); Assert.assertFalse(attempt.getAssignedContainerID().equals(fakeCid)); //Verify complete contianerManagerAddress Assert.assertEquals(MRApp.NM_HOST + ":" + MRApp.NM_PORT, attempt.getAssignedContainerMgrAddress()); }
@Test (timeout=10000) public void testCompletedTaskAttempt() throws Exception { HistoryFileInfo info = mock(HistoryFileInfo.class); when(info.getConfFile()).thenReturn(fullConfPath); completedJob = new CompletedJob(conf, jobId, fullHistoryPath, loadTasks, "user", info, jobAclsManager); TaskId mt1Id = MRBuilderUtils.newTaskId(jobId, 0, TaskType.MAP); TaskId rt1Id = MRBuilderUtils.newTaskId(jobId, 0, TaskType.REDUCE); TaskAttemptId mta1Id = MRBuilderUtils.newTaskAttemptId(mt1Id, 0); TaskAttemptId rta1Id = MRBuilderUtils.newTaskAttemptId(rt1Id, 0); Task mt1 = completedJob.getTask(mt1Id); Task rt1 = completedJob.getTask(rt1Id); TaskAttempt mta1 = mt1.getAttempt(mta1Id); assertEquals(TaskAttemptState.SUCCEEDED, mta1.getState()); assertEquals("localhost:45454", mta1.getAssignedContainerMgrAddress()); assertEquals("localhost:9999", mta1.getNodeHttpAddress()); TaskAttemptReport mta1Report = mta1.getReport(); assertEquals(TaskAttemptState.SUCCEEDED, mta1Report.getTaskAttemptState()); assertEquals("localhost", mta1Report.getNodeManagerHost()); assertEquals(45454, mta1Report.getNodeManagerPort()); assertEquals(9999, mta1Report.getNodeManagerHttpPort()); TaskAttempt rta1 = rt1.getAttempt(rta1Id); assertEquals(TaskAttemptState.SUCCEEDED, rta1.getState()); assertEquals("localhost:45454", rta1.getAssignedContainerMgrAddress()); assertEquals("localhost:9999", rta1.getNodeHttpAddress()); TaskAttemptReport rta1Report = rta1.getReport(); assertEquals(TaskAttemptState.SUCCEEDED, rta1Report.getTaskAttemptState()); assertEquals("localhost", rta1Report.getNodeManagerHost()); assertEquals(45454, rta1Report.getNodeManagerPort()); assertEquals(9999, rta1Report.getNodeManagerHttpPort()); }
protected static TaskAttemptState getExternalState( TaskAttemptStateInternal smState) { switch (smState) { case ASSIGNED: case UNASSIGNED: return TaskAttemptState.STARTING; case COMMIT_PENDING: return TaskAttemptState.COMMIT_PENDING; case FAIL_CONTAINER_CLEANUP: case FAIL_TASK_CLEANUP: case FAIL_FINISHING_CONTAINER: case FAILED: return TaskAttemptState.FAILED; case KILL_CONTAINER_CLEANUP: case KILL_TASK_CLEANUP: case KILLED: return TaskAttemptState.KILLED; case RUNNING: return TaskAttemptState.RUNNING; case NEW: return TaskAttemptState.NEW; case SUCCESS_CONTAINER_CLEANUP: case SUCCESS_FINISHING_CONTAINER: case SUCCEEDED: return TaskAttemptState.SUCCEEDED; default: throw new YarnRuntimeException("Attempt to convert invalid " + "stateMachineTaskAttemptState to externalTaskAttemptState: " + smState); } }