In the initialization phase MR creates all of a job's tasks; they are actually handed out when a TaskTracker (TT) sends its heartbeat. Handing out tasks raises several questions: is this node a good fit, should it receive a map or a reduce, and how many slots should be held back for speculative execution? These decisions require a scheduler, and in Hadoop the scheduler is a separate component, TaskScheduler. It is an abstract class; extend it and you can implement your own scheduling policy. The default scheduler is JobQueueTaskScheduler: on every heartbeat it builds tasks through assignTasks and packs them into the heartbeat response. Note that this class mostly measures the load of the cluster or of a single TT by counting tasks, which is not load in the usual sense. For example, a TT running just one task may have a very high OS-level load, yet to the TaskScheduler it looks lightly loaded, because it runs only one task. This is clearly a rather coarse granularity. Before looking at how Hadoop assigns tasks, here is what the extension point looks like.
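The following is a minimal sketch of extending TaskScheduler, not a production scheduler. MyFifoScheduler and its internal jobs list are invented names for illustration; assignTasks and getJobs are the abstract methods a subclass must implement, and taskTrackerManager is a protected field inherited from TaskScheduler. Since these mapred types are package-private in Hadoop 1.x, such a class would have to live in the org.apache.hadoop.mapred package:
package org.apache.hadoop.mapred;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;

// A deliberately tiny FIFO scheduler: hand out at most one
// node- or rack-local map task per heartbeat.
class MyFifoScheduler extends TaskScheduler {

  // In a real scheduler this list would be maintained by a
  // JobInProgressListener registered in start(); kept trivial here.
  private final List<JobInProgress> jobs = new ArrayList<JobInProgress>();

  @Override
  public synchronized List<Task> assignTasks(TaskTracker taskTracker)
      throws IOException {
    TaskTrackerStatus tts = taskTracker.getStatus();
    int numTaskTrackers =
        taskTrackerManager.getClusterStatus().getTaskTrackers();
    List<Task> assigned = new ArrayList<Task>();
    for (JobInProgress job : jobs) { // strict FIFO order
      if (job.getStatus().getRunState() != JobStatus.RUNNING) {
        continue;
      }
      // Prefer locality, just like the default scheduler does
      Task t = job.obtainNewNodeOrRackLocalMapTask(tts, numTaskTrackers,
          taskTrackerManager.getNumberOfUniqueHosts());
      if (t != null) {
        assigned.add(t);
        break; // one task per heartbeat keeps the sketch simple
      }
    }
    return assigned;
  }

  @Override
  public Collection<JobInProgress> getJobs(String queueName) {
    return jobs; // single implicit queue
  }
}
With the extension point in mind, let's walk through how the default scheduler assigns tasks.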
public synchronized List<Task> assignTasks(TaskTracker taskTracker)
throws IOException {
// Get the status of the TT that sent this heartbeat
TaskTrackerStatus taskTrackerStatus = taskTracker.getStatus();
// Get the status of the whole cluster: live TTs, blacklist and graylist info
ClusterStatus clusterStatus = taskTrackerManager.getClusterStatus();
// Number of live TTs
final int numTaskTrackers = clusterStatus.getTaskTrackers();
// Cluster-wide map capacity (the most maps it can run at once)
final int clusterMapCapacity = clusterStatus.getMaxMapTasks();
// Same as above, for reduces
final int clusterReduceCapacity = clusterStatus.getMaxReduceTasks();
// Get the job queue; as we saw in the previous post on initialization, the job has already been placed in it
Collection<JobInProgress> jobQueue =
jobQueueJobInProgressListener.getJobQueue();
// This TT's capacity: the most maps/reduces it can run, and how many are running now
final int trackerMapCapacity = taskTrackerStatus.getMaxMapSlots();
final int trackerReduceCapacity = taskTrackerStatus.getMaxReduceSlots();
final int trackerRunningMaps = taskTrackerStatus.countMapTasks();
final int trackerRunningReduces = taskTrackerStatus.countReduceTasks();
// The task list that will be returned
List<Task> assignedTasks = new ArrayList<Task>();
//
// Compute (running + pending) map and reduce task numbers across pool
//
int remainingReduceLoad = 0;
int remainingMapLoad = 0;
synchronized (jobQueue) {
for (JobInProgress job : jobQueue) {// iterate over all jobs
if (job.getStatus().getRunState() == JobStatus.RUNNING) {
// remaining map load of this job
remainingMapLoad += (job.desiredMaps() - job.finishedMaps());
if (job.scheduleReduces()) {// should reduces be scheduled yet?
remainingReduceLoad += // remaining reduce load
(job.desiredReduces() - job.finishedReduces());
}
}
}
}
// Map load as a fraction of the cluster's map capacity
double mapLoadFactor = 0.0;
if (clusterMapCapacity > 0) {
mapLoadFactor = (double)remainingMapLoad / clusterMapCapacity;
}
// Same for reduces
double reduceLoadFactor = 0.0;
if (clusterReduceCapacity > 0) {
reduceLoadFactor = (double)remainingReduceLoad / clusterReduceCapacity;
}
//
// In the below steps, we allocate first map tasks (if appropriate),
// and then reduce tasks if appropriate. We go through all jobs
// in order of job arrival; jobs only get serviced if their
// predecessors are serviced, too.
//
//
// We assign tasks to the current taskTracker if the given machine
// has a workload that's less than the maximum load of that kind of
// task.
// However, if the cluster is close to getting loaded i.e. we don't
// have enough _padding_ for speculative executions etc., we only
// schedule the "highest priority" task i.e. the task from the job
// with the highest priority.
//
// Map capacity this TT may use right now: its slot count scaled by the cluster-wide load factor
final int trackerCurrentMapCapacity =
Math.min((int)Math.ceil(mapLoadFactor * trackerMapCapacity),
trackerMapCapacity);
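// Worked example with invented numbers: 10 TTs with 4 map slots each
// give clusterMapCapacity = 40. If the running jobs still need 20 maps,
// mapLoadFactor = 20/40 = 0.5, so this TT may use at most
// ceil(0.5 * 4) = 2 of its 4 map slots this heartbeat; the outer
// Math.min only matters once the load factor exceeds 1.0.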
// Free map slots available for new assignments
int availableMapSlots = trackerCurrentMapCapacity - trackerRunningMaps;
boolean exceededMapPadding = false;
if (availableMapSlots > 0) {
// Check whether assigning more would eat into the reserved padding
exceededMapPadding =
exceededPadding(true, clusterStatus, trackerMapCapacity);
}
int numLocalMaps = 0;
int numNonLocalMaps = 0;
scheduleMaps:
for (int i=0; i < availableMapSlots; ++i) {// one pass per available map slot
synchronized (jobQueue) {
for (JobInProgress job : jobQueue) {// only running jobs are considered
if (job.getStatus().getRunState() != JobStatus.RUNNING) {
continue;
}
Task t = null;
// Try to obtain a node- or rack-local map task
t =
job.obtainNewNodeOrRackLocalMapTask(taskTrackerStatus,
numTaskTrackers, taskTrackerManager.getNumberOfUniqueHosts());
if (t != null) {
assignedTasks.add(t);// add to the result list
++numLocalMaps;
// Don't assign map tasks to the hilt!
// Leave some free slots in the cluster for future task-failures,
// speculative tasks etc. beyond the highest priority job
if (exceededMapPadding) {// stop assigning once the padding reserve would be exceeded
break scheduleMaps;
}
// Try all jobs again for the next Map task
break;
}
// Try to obtain a non-local (off-switch) map task
t =
job.obtainNewNonLocalMapTask(taskTrackerStatus, numTaskTrackers,
taskTrackerManager.getNumberOfUniqueHosts());
if (t != null) {
assignedTasks.add(t);
++numNonLocalMaps;
// We assign at most 1 off-switch or speculative task
// This is to prevent TaskTrackers from stealing local-tasks
// from other TaskTrackers.
break scheduleMaps;
}
}
}
}
int assignedMaps = assignedTasks.size();
// Reduce assignment: broadly the same logic as for maps
final int trackerCurrentReduceCapacity =
Math.min((int)Math.ceil(reduceLoadFactor * trackerReduceCapacity),
trackerReduceCapacity);
final int availableReduceSlots =
Math.min((trackerCurrentReduceCapacity - trackerRunningReduces), 1);
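// Note the extra Math.min(..., 1): unlike maps, at most one reduce is
// assigned per heartbeat. A plausible reason is that a reduce holds its
// slot through the entire shuffle, so handing them out one at a time
// spreads reduces more evenly across the cluster.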
boolean exceededReducePadding = false;
if (availableReduceSlots > 0) {
exceededReducePadding = exceededPadding(false, clusterStatus,
trackerReduceCapacity);
synchronized (jobQueue) {
for (JobInProgress job : jobQueue) {
if (job.getStatus().getRunState() != JobStatus.RUNNING ||
job.numReduceTasks == 0) {
continue;
}
Task t =
job.obtainNewReduceTask(taskTrackerStatus, numTaskTrackers,
taskTrackerManager.getNumberOfUniqueHosts()
);
if (t != null) {
assignedTasks.add(t);
break;
}
// Don't assign reduce tasks to the hilt!
// Leave some free slots in the cluster for future task-failures,
// speculative tasks etc. beyond the highest priority job
if (exceededReducePadding) {
break;
}
}
}
}
if (LOG.isDebugEnabled()) {
LOG.debug("Task assignments for " + taskTrackerStatus.getTrackerName() + " --> " +
"[" + mapLoadFactor + ", " + trackerMapCapacity + ", " +
trackerCurrentMapCapacity + ", " + trackerRunningMaps + "] -> [" +
(trackerCurrentMapCapacity - trackerRunningMaps) + ", " +
assignedMaps + " (" + numLocalMaps + ", " + numNonLocalMaps +
")] [" + reduceLoadFactor + ", " + trackerReduceCapacity + ", " +
trackerCurrentReduceCapacity + "," + trackerRunningReduces +
"] -> [" + (trackerCurrentReduceCapacity - trackerRunningReduces) +
", " + (assignedTasks.size()-assignedMaps) + "]");
}
return assignedTasks;
}
The reserved capacity ("padding") keeps room for high-priority jobs and speculative tasks; whether it would be exceeded is computed as follows:
private boolean exceededPadding(boolean isMapTask,
ClusterStatus clusterStatus,
int maxTaskTrackerSlots) {
// Number of live TTs
int numTaskTrackers = clusterStatus.getTaskTrackers();
// Tasks of this kind currently running in the cluster
int totalTasks =
(isMapTask) ? clusterStatus.getMapTasks() :
clusterStatus.getReduceTasks();
// Total cluster capacity for this kind of task
int totalTaskCapacity =
isMapTask ? clusterStatus.getMaxMapTasks() :
clusterStatus.getMaxReduceTasks();
// Get the job queue
Collection<JobInProgress> jobQueue =
jobQueueJobInProgressListener.getJobQueue();
// Examine every running job
boolean exceededPadding = false;
synchronized (jobQueue) {
int totalNeededTasks = 0;
for (JobInProgress job : jobQueue) {
if (job.getStatus().getRunState() != JobStatus.RUNNING ||
job.numReduceTasks == 0) {
continue;
}
//
// Beyond the highest-priority task, reserve a little
// room for failures and speculative executions; don't
// schedule tasks to the hilt.
//
totalNeededTasks +=
isMapTask ? job.desiredMaps() : job.desiredReduces();
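// Note: totalNeededTasks accumulates over the queue in priority order,
// so the padding computed below grows as lower-priority jobs are
// considered, reserving proportionally more room for the jobs above them.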
int padding = 0;
if (numTaskTrackers > MIN_CLUSTER_SIZE_FOR_PADDING) {
// Compute the padding reserved for high-priority jobs and speculative execution
padding =
Math.min(maxTaskTrackerSlots,
(int) (totalNeededTasks * padFraction));
}
// If running tasks plus the padding reach the cluster's total capacity, the padding is considered exceeded
if (totalTasks + padding >= totalTaskCapacity) {
exceededPadding = true;
break;
}
}
}
return exceededPadding;
}
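To make the padding concrete, here is a small example with invented numbers, assuming the default padFraction of 0.01 (in this version it is read from mapred.jobtracker.taskalloc.capacitypad): a cluster of 10 TTs with 4 map slots each has totalTaskCapacity = 40. If the highest-priority running job wants 1000 maps, then padding = min(4, (int)(1000 * 0.01)) = 4. With 37 maps already running, 37 + 4 >= 40 holds, so exceededPadding returns true and assignTasks stops filling the remaining slots, keeping them free for failures and speculation. Also note that on very small clusters (at or below MIN_CLUSTER_SIZE_FOR_PADDING trackers) no padding is reserved at all.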
Setup and cleanup tasks are not governed by the scheduler; the JT assigns them itself. The relevant code is in the JT's heartbeat method, shown below:
if (recoveryManager.shouldSchedule() && acceptNewTasks && !isBlacklisted) {
TaskTrackerStatus taskTrackerStatus = getTaskTrackerStatus(trackerName);
if (taskTrackerStatus == null) {
LOG.warn("Unknown task tracker polling; ignoring: " + trackerName);
} else {
// Try to assign setup/cleanup tasks first
List<Task> tasks = getSetupAndCleanupTasks(taskTrackerStatus);
if (tasks == null) {
// None pending, so fall back to the task scheduler
tasks = taskScheduler.assignTasks(taskTrackers.get(trackerName));
}
if (tasks != null) {
for (Task task : tasks) {
expireLaunchingTasks.addNewTask(task.getTaskID());
if(LOG.isDebugEnabled()) {
LOG.debug(trackerName + " -> LaunchTask: " + task.getTaskID());
}
actions.add(new LaunchTaskAction(task));
}
}
}
}
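Two details are worth noting in this snippet: getSetupAndCleanupTasks is consulted first and the pluggable scheduler only runs when it returns null, so setup/cleanup tasks effectively jump the queue on a heartbeat; and every launched task is registered with expireLaunchingTasks, which lets the JT time out tasks that were handed to a TT but never start reporting progress.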