/**
 * Sends one heartbeat to the JobTracker for every entry in {@code trackers}
 * and then advances the shared {@code responseId}.
 *
 * @param status health values to copy into each tracker's health status
 *               before the heartbeat; when {@code null} the health status
 *               of the freshly created tracker status is left untouched
 * @param initialContact passed through as the third argument of
 *               {@code jobTracker.heartbeat}
 * @throws IOException declared for failures of the heartbeat call
 */
private static void sendHeartBeat(TaskTrackerHealthStatus status,
    boolean initialContact) throws IOException {
  for (String trackerName : trackers) {
    String hostName = JobInProgress.convertTrackerNameToHostName(trackerName);
    TaskTrackerStatus trackerStatus =
        new TaskTrackerStatus(trackerName, hostName);

    if (status != null) {
      // Mirror the supplied health values onto this tracker's status.
      TaskTrackerHealthStatus reported = trackerStatus.getHealthStatus();
      reported.setNodeHealthy(status.isNodeHealthy());
      reported.setHealthReport(status.getHealthReport());
      reported.setLastReported(status.getLastReported());
    }

    jobTracker.heartbeat(trackerStatus, false, initialContact, false,
        responseId);
  }
  // All trackers in one round share the same response id.
  responseId++;
}
public void testNodeHealthBlackListing() throws Exception { TaskTrackerHealthStatus status = getUnhealthyNodeStatus("ERROR"); //Blacklist tracker due to node health failures. sendHeartBeat(status, false); for (String host : hosts) { checkReasonForBlackListing(host, nodeUnHealthyReasonSet); } status.setNodeHealthy(true); status.setLastReported(System.currentTimeMillis()); status.setHealthReport(""); //white list tracker so the further test cases can be //using trackers. sendHeartBeat(status, false); assertEquals("Trackers still blacklisted after healthy report", 0, jobTracker.getBlacklistedTrackerCount()); }
/**
 * Pushes the health information carried by a tracker status into
 * {@code faultyTrackers}, holding the {@code faultyTrackers} monitor while
 * doing so.
 *
 * @param trackerStatus status whose host and health status are forwarded
 */
private void updateNodeHealthStatus(TaskTrackerStatus trackerStatus) {
  TaskTrackerHealthStatus health = trackerStatus.getHealthStatus();
  synchronized (faultyTrackers) {
    String host = trackerStatus.getHost();
    boolean nodeHealthy = health.isNodeHealthy();
    String report = health.getHealthReport();
    faultyTrackers.setNodeHealthStatus(host, nodeHealthy, report);
  }
}
/**
 * Reports whether the node is healthy according to {@code healthChecker}.
 *
 * @return {@code true} when no health checker is configured; otherwise the
 *         healthy flag the checker writes into a fresh health status
 */
@Override
public boolean isHealthy() {
  TaskTrackerHealthStatus snapshot = new TaskTrackerHealthStatus();
  if (healthChecker == null) {
    // Without a checker the node is considered healthy.
    return true;
  }
  healthChecker.setHealthStatus(snapshot);
  return snapshot.isNodeHealthy();
}
/**
 * Pushes the health information carried by a tracker status, together with a
 * caller-supplied timestamp, into {@code faultyTrackers} while holding the
 * {@code faultyTrackers} monitor.
 *
 * @param trackerStatus status whose host and health status are forwarded
 * @param timeStamp value forwarded unchanged as the last argument of
 *                  {@code setNodeHealthStatus}
 */
private void updateNodeHealthStatus(TaskTrackerStatus trackerStatus,
    long timeStamp) {
  TaskTrackerHealthStatus health = trackerStatus.getHealthStatus();
  synchronized (faultyTrackers) {
    String host = trackerStatus.getHost();
    boolean nodeHealthy = health.isNodeHealthy();
    String report = health.getHealthReport();
    faultyTrackers.setNodeHealthStatus(host, nodeHealthy, report, timeStamp);
  }
}
public void testBlackListingWithFailuresAndHealthStatus() throws Exception { runBlackListingJob(jobTracker, trackers); assertEquals("Tracker 1 not blacklisted", 1, jobTracker.getBlacklistedTrackerCount()); checkReasonForBlackListing(hosts[0], exceedsFailuresReasonSet); TaskTrackerHealthStatus status = getUnhealthyNodeStatus("ERROR"); sendHeartBeat(status, false); assertEquals("All trackers not blacklisted", 3, jobTracker.getBlacklistedTrackerCount()); checkReasonForBlackListing(hosts[0], unhealthyAndExceedsFailure); checkReasonForBlackListing(hosts[1], nodeUnHealthyReasonSet); checkReasonForBlackListing(hosts[2], nodeUnHealthyReasonSet); clock.jumpADay = true; sendHeartBeat(status, false); assertEquals("All trackers not blacklisted", 3, jobTracker.getBlacklistedTrackerCount()); for (String host : hosts) { checkReasonForBlackListing(host, nodeUnHealthyReasonSet); } //clear blacklisted trackers due to node health reasons. sendHeartBeat(null, false); assertEquals("All trackers not white listed", 0, jobTracker.getBlacklistedTrackerCount()); //Clear the blacklisted trackers due to failures. clock.jumpADay = false; }
public void testBlacklistingReasonString() throws Exception { String error = "ERROR"; String error1 = "ERROR1"; TaskTrackerHealthStatus status = getUnhealthyNodeStatus(error); sendHeartBeat(status, false); assertEquals("All trackers not blacklisted", 3, jobTracker.getBlacklistedTrackerCount()); checkReasonForBlackListing(hosts[0], nodeUnHealthyReasonSet); checkReasonForBlackListing(hosts[1], nodeUnHealthyReasonSet); checkReasonForBlackListing(hosts[2], nodeUnHealthyReasonSet); for (int i = 0; i < hosts.length; i++) { //Replace new line as we are adding new line //in getFaultReport assertEquals("Blacklisting reason string not correct for host " + i, error, jobTracker.getFaultReport(hosts[i]).replace("\n", "")); } status.setNodeHealthy(false); status.setLastReported(System.currentTimeMillis()); status.setHealthReport(error1); sendHeartBeat(status, false); checkReasonForBlackListing(hosts[0], nodeUnHealthyReasonSet); checkReasonForBlackListing(hosts[1], nodeUnHealthyReasonSet); checkReasonForBlackListing(hosts[2], nodeUnHealthyReasonSet); for (int i = 0; i < hosts.length; i++) { //Replace new line as we are adding new line //in getFaultReport assertEquals("Blacklisting reason string not correct for host " + i, error1, jobTracker.getFaultReport(hosts[i]).replace("\n", "")); } //clear the blacklisted trackers with node health reasons. sendHeartBeat(null, false); }
private TaskTrackerHealthStatus getUnhealthyNodeStatus(String error) { TaskTrackerHealthStatus status = new TaskTrackerHealthStatus(); status.setNodeHealthy(false); status.setLastReported(System.currentTimeMillis()); status.setHealthReport(error); return status; }
/**
 * Copies this object's current health flag, health report and last-reported
 * time into the given {@link TaskTrackerHealthStatus}.
 *
 * @param healthStatus status object to fill in
 */
synchronized void setHealthStatus(TaskTrackerHealthStatus healthStatus) {
  boolean healthy = this.isHealthy();
  healthStatus.setNodeHealthy(healthy);

  String report = this.getHealthReport();
  healthStatus.setHealthReport(report);

  long lastReported = this.getLastReportedTime();
  healthStatus.setLastReported(lastReported);
}
/**
 * Drives the node health checker's timer task by hand against a sequence of
 * health scripts and verifies the resulting health status after each run:
 * healthy, healthy to unhealthy, unhealthy to healthy, and healthy to
 * timed-out.
 *
 * <p>The configuration output stream is closed explicitly once the
 * configuration has been written, and the final comparison passes the
 * expected timeout message before the actual report so that a failure
 * message labels the two values correctly.
 */
public void testNodeHealthScript() throws Exception {
  TaskTrackerHealthStatus healthStatus = new TaskTrackerHealthStatus();
  String errorScript = "echo ERROR\n echo \"Tracker not healthy\"";
  String normalScript = "echo \"I am all fine\"";
  String timeOutScript = "sleep 4\n echo\"I am fine\"";

  Configuration conf = getConfForNodeHealthScript();
  // Close the stream ourselves instead of abandoning it after writeXml.
  FileOutputStream confOut = new FileOutputStream(nodeHealthConfigFile);
  try {
    conf.writeXml(confOut);
  } finally {
    confOut.close();
  }

  NodeHealthCheckerService nodeHealthChecker =
      new NodeHealthCheckerService(conf);
  TimerTask timer = nodeHealthChecker.getTimer();

  writeNodeHealthScriptFile(normalScript, true);
  timer.run();
  nodeHealthChecker.setHealthStatus(healthStatus);
  LOG.info("Checking initial healthy condition");
  // Check proper report conditions.
  assertTrue("Node health status reported unhealthy",
      healthStatus.isNodeHealthy());
  assertTrue("Node health status reported unhealthy",
      healthStatus.getHealthReport().isEmpty());

  // Healthy to unhealthy transition: write out the error script, run the
  // timer and refresh the health status.
  writeNodeHealthScriptFile(errorScript, true);
  timer.run();
  nodeHealthChecker.setHealthStatus(healthStatus);
  LOG.info("Checking Healthy--->Unhealthy");
  assertFalse("Node health status reported healthy",
      healthStatus.isNodeHealthy());
  assertFalse("Node health status reported healthy",
      healthStatus.getHealthReport().isEmpty());

  // Unhealthy to healthy transition.
  writeNodeHealthScriptFile(normalScript, true);
  timer.run();
  nodeHealthChecker.setHealthStatus(healthStatus);
  LOG.info("Checking UnHealthy--->healthy");
  // Check proper report conditions.
  assertTrue("Node health status reported unhealthy",
      healthStatus.isNodeHealthy());
  assertTrue("Node health status reported unhealthy",
      healthStatus.getHealthReport().isEmpty());

  // Healthy to timeout transition.
  writeNodeHealthScriptFile(timeOutScript, true);
  timer.run();
  nodeHealthChecker.setHealthStatus(healthStatus);
  LOG.info("Checking Healthy--->timeout");
  assertFalse("Node health status reported healthy even after timeout",
      healthStatus.isNodeHealthy());
  // Expected value first, actual value second.
  assertEquals("Node time out message not propogated",
      NodeHealthCheckerService.NODE_HEALTH_SCRIPT_TIMED_OUT_MSG,
      healthStatus.getHealthReport());
}
/** * Build and transmit the heart beat to the JobTracker * @param jobClient The jobTracker RPC handle * @param heartbeatResponseId Last heartbeat response received * @param status TaskTrackerStatus to transmit * @return false if the tracker was unknown * @throws IOException */ protected HeartbeatResponse transmitHeartBeat( InterTrackerProtocol jobClient, short heartbeatResponseId, TaskTrackerStatus status) throws IOException { // // Check if we should ask for a new Task // boolean askForNewTask; long localMinSpaceStart; synchronized (this) { askForNewTask = ((status.countOccupiedMapSlots() < maxMapSlots || status.countOccupiedReduceSlots() < maxReduceSlots) && acceptNewTasks); localMinSpaceStart = minSpaceStart; } if (askForNewTask) { checkLocalDirs(fConf.getLocalDirs()); askForNewTask = enoughFreeSpace(localMinSpaceStart); gatherResourceStatus(status); } //add node health information TaskTrackerHealthStatus healthStatus = status.getHealthStatus(); synchronized (this) { if (healthChecker != null) { healthChecker.setHealthStatus(healthStatus); } else { healthStatus.setNodeHealthy(true); healthStatus.setLastReported(0L); healthStatus.setHealthReport(""); } } // // Xmit the heartbeat // HeartbeatResponse heartbeatResponse = jobClient.heartbeat(status, justStarted, justInited, askForNewTask, heartbeatResponseId); synchronized (this) { for (TaskStatus taskStatus : status.getTaskReports()) { if (taskStatus.getRunState() != TaskStatus.State.RUNNING && taskStatus.getRunState() != TaskStatus.State.UNASSIGNED && taskStatus.getRunState() != TaskStatus.State.COMMIT_PENDING && !taskStatus.inTaskCleanupPhase()) { if (taskStatus.getIsMap()) { mapTotal--; } else { reduceTotal--; } try { myInstrumentation.completeTask(taskStatus.getTaskID()); } catch (MetricsException me) { LOG.warn("Caught: " + StringUtils.stringifyException(me)); } removeRunningTask(taskStatus.getTaskID()); } } // Clear transient status information which should only // be sent once to the JobTracker for 
(TaskInProgress tip: runningTasks.values()) { tip.getStatus().clearStatus(); } } return heartbeatResponse; }
public void testBlackListingWithTrackerReservation() throws Exception { JobConf conf = new JobConf(); conf.setNumMapTasks(1); conf.setNumReduceTasks(1); FakeJobInProgress job = new FakeJobInProgress(conf, jobTracker); TaskTracker tt1 = jobTracker.getTaskTracker(trackers[0]); TaskTracker tt2 = jobTracker.getTaskTracker(trackers[1]); tt1.reserveSlots(TaskType.MAP, job, 1); tt1.reserveSlots(TaskType.REDUCE, job, 1); tt2.reserveSlots(TaskType.MAP, job, 1); tt2.reserveSlots(TaskType.REDUCE, job, 1); assertEquals("Tracker 1 not reserved for the job 1", 2, job .getNumReservedTaskTrackersForMaps()); assertEquals("Tracker 1 not reserved for the job 1", 2, job .getNumReservedTaskTrackersForReduces()); runBlackListingJob(jobTracker, trackers); assertEquals("Tracker 1 not unreserved for the job 1", 1, job .getNumReservedTaskTrackersForMaps()); assertEquals("Tracker 1 not unreserved for the job 1", 1, job .getNumReservedTaskTrackersForReduces()); assertEquals("Tracker 1 not blacklisted", 1, jobTracker .getBlacklistedTrackerCount()); checkReasonForBlackListing(hosts[0], exceedsFailuresReasonSet); TaskTrackerHealthStatus status = getUnhealthyNodeStatus("ERROR"); sendHeartBeat(status, false); assertEquals("All trackers not blacklisted", 3, jobTracker.getBlacklistedTrackerCount()); checkReasonForBlackListing(hosts[0], unhealthyAndExceedsFailure); checkReasonForBlackListing(hosts[1], nodeUnHealthyReasonSet); checkReasonForBlackListing(hosts[2], nodeUnHealthyReasonSet); assertEquals("Tracker 1 not unreserved for the job 1", 0, job .getNumReservedTaskTrackersForMaps()); assertEquals("Tracker 1 not unreserved for the job 1", 0, job .getNumReservedTaskTrackersForReduces()); //white list all trackers for health reasons and failure counts clock.jumpADay = true; sendHeartBeat(null, false); }