A complete Hadoop MapReduce job run can be described as follows:
- The client submits a MapReduce job to the JobTracker;
- The JobTracker schedules the job, generating MapTasks and ReduceTasks;
- TaskTrackers receive the MapTasks and ReduceTasks;
- Each TaskTracker starts a new Child Task JVM for every MapTask and ReduceTask;
- The Child Task JVM runs the MapTask or ReduceTask;
- The Child Task JVM reports progress and status to the JobTracker through its TaskTracker;
- When every task under the JobTracker has succeeded, the job is marked as successful.
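Before diving into the source, here is what the client side looks like: a minimal driver in the Hadoop 2.x new-API style (a sketch; WordCountMapper and WordCountReducer are placeholder classes, sketched later in this article):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCountDriver {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "word count");
    job.setJarByClass(WordCountDriver.class);
    job.setMapperClass(WordCountMapper.class);    // placeholder, sketched below
    job.setCombinerClass(WordCountReducer.class); // optional local aggregation
    job.setReducerClass(WordCountReducer.class);  // placeholder, sketched below
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    // waitForCompletion() wraps Job.submit(), the entry point traced below
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}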
JobClient submits the MapReduce Job
Once JobClient.runJob() is called, the MapReduce elephant starts running.
Drill into Job.submit() and you find the method that does the real work: JobSubmitter.submitJobInternal().
JobStatus submitJobInternal(Job job, Cluster cluster)
    throws ClassNotFoundException, InterruptedException, IOException {
  // check the specification of the job
  checkSpecs(job);
  Path jobStagingArea =
      JobSubmissionFiles.getStagingDir(cluster, job.getConfiguration());
  // configure the command line options correctly ...
  ...
}
Among other things, submitJobInternal() writes the input splits into the staging area. writeNewSplits() asks the job's InputFormat to compute them:
private <T extends InputSplit> int writeNewSplits(JobContext job, Path jobSubmitDir)
    throws IOException, InterruptedException, ClassNotFoundException {
  Configuration conf = job.getConfiguration();
  InputFormat<?, ?> input =
      ReflectionUtils.newInstance(job.getInputFormatClass(), conf);
  // call the InputFormat's getSplits() method to produce the InputSplits
  List<InputSplit> splits = input.getSplits(job);
  T[] array = (T[]) splits.toArray(new InputSplit[splits.size()]);
  // sort the splits into order based on size, so that the biggest go first
  ...
}
On the server side, the JobTracker's offerService() loop has been running since startup; it starts the task scheduler, runs recovery, and spins up the housekeeping threads before opening the inter-tracker server:
public void offerService() throws InterruptedException, IOException {
  // Prepare for recovery. This is done irrespective of the status of restart
  // flag.
  while (true) {
    try {
      recoveryManager.updateRestartCount();
      break;
    } catch (IOException ioe) {
      LOG.warn("Failed to initialize recovery manager. ", ioe);
      // wait for some time
      Thread.sleep(FS_ACCESS_RETRY_PERIOD);
      LOG.warn("Retrying...");
    }
  }
  taskScheduler.start();
  recoveryManager.recover();
  // refresh the node list as the recovery manager might have added
  // disallowed trackers
  refreshHosts();
  startExpireTrackersThread();
  expireLaunchingTaskThread.start();
  if (completedJobStatusStore.isActive()) {
    completedJobsStoreThread = new Thread(completedJobStatusStore,
                                          "completedjobsStore-housekeeper");
    completedJobsStoreThread.start();
  }
  // start the inter-tracker server ...
  ...
}
The submission RPC finally lands in JobTracker.submitJob(), which wraps the job in a JobInProgress and queues it:
private JobStatus submitJob(org.apache.hadoop.mapreduce.JobID jobID,
    int restartCount, UserGroupInformation ugi, String jobSubmitDir,
    boolean recovered, Credentials ts) throws IOException, InterruptedException {
  ...
  // Create the JobInProgress, temporarily unlock the JobTracker since
  // we are about to copy job.xml from HDFS
  JobInProgress job =
      new JobInProgress(this, this.conf, restartCount, jobInfo, ts);
  synchronized (this) {
    ...
    return addJob(jobId, job);
  }
}
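Since getSplits() determines the number of MapTasks (typically one per HDFS block for FileInputFormat), the split count can be previewed from the client. A small sketch, assuming the input path exists and the FileSystem is reachable (the path here is hypothetical):
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class SplitPreview {
  public static void main(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration());
    FileInputFormat.addInputPath(job, new Path("/data/input")); // hypothetical path
    // the same call writeNewSplits() makes: one MapTask per returned split
    List<InputSplit> splits = new TextInputFormat().getSplits(job);
    System.out.println("map tasks = " + splits.size());
  }
}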
The JobInitManager breaks each job down into tasks: it takes newly submitted jobs off the jobInitQueue and hands each one to a thread pool for initialization:
class JobInitManager implements Runnable {
  public void run() {
    JobInProgress job = null;
    while (true) {
      try {
        synchronized (jobInitQueue) {
          while (jobInitQueue.isEmpty()) {
            jobInitQueue.wait();
          }
          job = jobInitQueue.remove(0);
        }
        threadPool.execute(new InitJob(job));
      } catch (InterruptedException t) {
        LOG.info("JobInitManagerThread interrupted.");
        break;
      }
    }
    LOG.info("Shutting down thread pool");
    threadPool.shutdownNow();
  }
}
Initialization calls JobInProgress.initTasks(), which creates the TaskInProgress objects for the map and reduce tasks:
public synchronized void initTasks()
    throws IOException, KillInterruptedException, UnknownHostException {
  ...
  createMapTasks(jobFile.toString(), taskSplitMetaInfo);
  ...
  // set the launch time
  this.launchTime = JobTracker.getClock().getTime();
  createReduceTasks(jobFile.toString());
  ...
}
The JobTracker assigns tasks to TaskTrackers
In its heartbeat() method, the JobTracker calls JobQueueTaskScheduler.assignTasks(TaskTracker taskTracker) and returns the assigned tasks inside the HeartbeatResponse.
public synchronized HeartbeatResponse heartbeat(TaskTrackerStatus status,
boolean restarted,
boolean initialContact,
boolean acceptNewTasks,
short responseId)
throws IOException {
...
// Process this heartbeat
short newResponseId = (short)(responseId + 1);
status.setLastSeen(now);
if (!processHeartbeat(status, initialContact)) {
if (prevHeartbeatResponse != null) {
trackerToHeartbeatResponseMap.remove(trackerName);
}
return new HeartbeatResponse(newResponseId,
new TaskTrackerAction[] {new ReinitTrackerAction()});
}
// Initialize the response to be sent for the heartbeat
HeartbeatResponse response = new HeartbeatResponse(newResponseId, null);
List<TaskTrackerAction> actions = new ArrayList<TaskTrackerAction>();
isBlacklisted = faultyTrackers.isBlacklisted(status.getHost());
// Check for new tasks to be executed
...
}
On the TaskTracker side, offerService() is the main service loop: it sends the heartbeat and processes whatever directives the JobTracker returns.
/**
 * The server retry loop.
 * This while-loop attempts to connect to the JobTracker. It ...
 */
State offerService() throws Exception {
long lastHeartbeat = 0;
while (running && !shuttingDown) {
try {
...
// Send the heartbeat and process the jobtracker's directives
HeartbeatResponse heartbeatResponse = transmitHeartBeat(now);
TaskTrackerAction[] actions = heartbeatResponse.getActions();
...
if (actions != null){
for(TaskTrackerAction action: actions) {
if (action instanceof LaunchTaskAction) {
addToTaskQueue((LaunchTaskAction)action);
} else if (action instanceof CommitTaskAction) {
CommitTaskAction commitAction = (CommitTaskAction)action;
if (!commitResponses.contains(commitAction.getTaskID())) {
LOG.info("Received commit task action for " +
commitAction.getTaskID());
commitResponses.add(commitAction.getTaskID());
}
} else {
tasksToCleanup.put(action);
}
}
}
markUnresponsiveTasks();
killOverflowingTasks();
//we've cleaned up, resume normal operation
if (!acceptNewTasks && isIdle()) {
acceptNewTasks=true;
}
...
}
}
return State.NORMAL;
}
The TaskLauncher thread keeps polling the tasksToLaunch queue; once enough slots are free, it calls launchTask() to start the task:
public void run() {
while (!Thread.interrupted()) {
try {
TaskInProgress tip;
Task task;
synchronized (tasksToLaunch) {
while (tasksToLaunch.isEmpty()) {
tasksToLaunch.wait();
}
//get the TIP
tip = tasksToLaunch.remove(0);
task = tip.getTask();
LOG.info("Trying to launch : " + tip.getTask().getTaskID() +
" which needs " + task.getNumSlotsRequired() + " slots");
}
//wait for free slots to run
synchronized (numFreeSlots) {
boolean canLaunch = true;
while (numFreeSlots.get() < task.getNumSlotsRequired()) {
//Make sure that there is no kill task action for this task!
//We are not locking tip here, because it would reverse the
//locking order!
//Also, Lock for the tip is not required here! because :
// 1. runState of TaskStatus is volatile
// 2. Any notification is not missed because notification is
//    synchronized
...
}
}
//got a free slot, launch the task
...
} catch (InterruptedException e) {
return; // ALL DONE
}
}
}
Meanwhile, the TaskTracker has forked a fresh Child JVM for the task. In Child.main(), the child keeps asking its TaskTracker for work over the TaskUmbilicalProtocol:
try {
while (true) {
taskid = null;
JvmTask myTask = umbilical.getTask(context);
if (myTask.shouldDie()) {
break;
} else {
if (myTask.getTask() == null) {
taskid = null;
if (++idleLoopCount >= SLEEP_LONGER_COUNT) {
//we sleep for a bigger interval when we don't receive
//tasks for a while
Thread.sleep(1500);
} else {
Thread.sleep(500);
}
continue;
}
}
idleLoopCount = 0;
task = myTask.getTask();
...
final Task taskFinal = task;
childUGI.doAs(new PrivilegedExceptionAction<Object>() {
@Override
public Object run() throws Exception {
try {
// use job-specified working directory
FileSystem.get(job).setWorkingDirectory(job.getWorkingDirectory());
taskFinal.run(job, umbilical); // run the task
} finally {
TaskLog.syncLogs(logLocation, taskid, isCleanup);
}
return null;
}
});
...
}
}
}
}
Note that this article traces the New API (org.apache.hadoop.mapreduce) code path.
Running the MapTask
The map phase is the easier one to follow.
In its run() method, MapTask calls runNewMapper():
@SuppressWarnings("unchecked")
private <INKEY,INVALUE,OUTKEY,OUTVALUE>
void runNewMapper(final JobConf job,
final TaskSplitIndex splitIndex,
final TaskUmbilicalProtocol umbilical,
TaskReporter reporter
) throws IOException, ClassNotFoundException,
InterruptedException {
// make a task context so we can get the classes
org.apache.hadoop.mapreduce.TaskAttemptContext taskContext =
new org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl(job,
getTaskID(),
reporter);
// make a mapper
org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE> mapper =
(org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE>)
ReflectionUtils.newInstance(taskContext.getMapperClass(), job);
// make the input format
org.apache.hadoop.mapreduce.InputFormat<INKEY,INVALUE> inputFormat =
(org.apache.hadoop.mapreduce.InputFormat<INKEY,INVALUE>)
ReflectionUtils.newInstance(taskContext.getInputFormatClass(), job);
// rebuild the input split
org.apache.hadoop.mapreduce.InputSplit split = null;
split = getSplitDetails(new Path(splitIndex.getSplitLocation()),
splitIndex.getStartOffset());
org.apache.hadoop.mapreduce.RecordReader<INKEY,INVALUE> input =
new NewTrackingRecordReader<INKEY,INVALUE>
(split, inputFormat, reporter, taskContext);
job.setBoolean(JobContext.SKIP_RECORDS, isSkipping());
org.apache.hadoop.mapreduce.RecordWriter output = null;
// get an output object
if (job.getNumReduceTasks() == 0) {
output =
new NewDirectOutputCollector(taskContext, job, umbilical, reporter);
} else {
output = new NewOutputCollector(taskContext, job, umbilical, reporter);
}
org.apache.hadoop.mapreduce.MapContext<INKEY, INVALUE, OUTKEY, OUTVALUE>
mapContext =
new MapContextImpl<INKEY, INVALUE, OUTKEY, OUTVALUE>(job, getTaskID(),
input, output,
committer,
reporter, split);
org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE>.Context
mapperContext =
new WrappedMapper<INKEY, INVALUE, OUTKEY, OUTVALUE>().getMapContext(
mapContext);
input.initialize(split, mapperContext);
mapper.run(mapperContext);
mapPhase.complete();
setPhase(TaskStatus.Phase.SORT);
statusUpdate(umbilical);
input.close();
output.close(mapperContext);
}
runNewMapper() then drives the user-supplied Mapper class:
/**
 * Expert users can override this method for more complete control over the
 * execution of the Mapper.
 * @param context
 * @throws IOException
 */
public void run(Context context) throws IOException, InterruptedException {
  setup(context);
  while (context.nextKeyValue()) {
    map(context.getCurrentKey(), context.getCurrentValue(), context);
  }
  cleanup(context);
}
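For reference, here is the canonical word-count Mapper that this loop would drive (a sketch; class and field names are illustrative):
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class WordCountMapper
    extends Mapper<LongWritable, Text, Text, IntWritable> {
  private static final IntWritable ONE = new IntWritable(1);
  private final Text word = new Text();

  @Override
  public void map(LongWritable key, Text value, Context context)
      throws IOException, InterruptedException {
    // invoked once per record by the Mapper.run() loop shown above
    StringTokenizer itr = new StringTokenizer(value.toString());
    while (itr.hasMoreTokens()) {
      word.set(itr.nextToken());
      context.write(word, ONE); // collected into the MapOutputBuffer described next
    }
  }
}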
Map output goes to the in-memory MapOutputBuffer
The map output is collected through the context; the default implementation gathers it into a MapOutputBuffer, which is implemented as a circular (ring) buffer.
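As a toy sketch of the wraparound idea only (Hadoop's real kvbuffer/kvmeta bookkeeping is considerably more involved):
// Minimal ring buffer: writes wrap past the end of the array, and a
// "spill" frees space at the read end -- the same idea MapOutputBuffer
// uses so the mapper can keep writing while earlier output is spilled.
class RingBuffer {
  private final byte[] buf;
  private int writePos = 0; // next byte to write
  private int readPos = 0;  // first unspilled byte
  private int used = 0;

  RingBuffer(int capacity) {
    buf = new byte[capacity];
  }

  boolean write(byte[] data) {
    if (used + data.length > buf.length) {
      return false; // buffer full: in Hadoop this triggers sortAndSpill()
    }
    for (byte b : data) {
      buf[writePos] = b;
      writePos = (writePos + 1) % buf.length; // wrap around
    }
    used += data.length;
    return true;
  }

  void spill(int n) { // pretend n bytes were written to disk
    readPos = (readPos + n) % buf.length;
    used -= n;
  }
}
The real entry point is MapOutputBuffer.collect(), whose contract is spelled out in its javadoc: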
/**
 * Serialize the key, value to intermediate storage.
 * When this method returns, kvindex must refer to sufficient unused
 * storage to store one METADATA.
 */
public synchronized void collect(K key, V value, final int partition)
    throws IOException {
  ...
}
Once the buffer passes its spill threshold, a background thread sorts the buffered records by partition and key and writes them out in sortAndSpill():
private void sortAndSpill() throws IOException, ClassNotFoundException,
InterruptedException {
//approximate the length of the output file to be the length of the
//buffer + header lengths for the partitions
final long size = (bufend >= bufstart
? bufend - bufstart
: (bufvoid - bufend) + bufstart) +
partitions * APPROX_HEADER_LENGTH;
FSDataOutputStream out = null;
try {
// create spill file
final SpillRecord spillRec = new SpillRecord(partitions);
final Path filename =
mapOutputFile.getSpillFileForWrite(numSpills, size);
out = rfs.create(filename);
final int mstart = kvend / NMETA;
final int mend = 1 + // kvend is a valid record
(kvstart >= kvend
? kvstart
: kvmeta.capacity() + kvstart) / NMETA;
sorter.sort(MapOutputBuffer.this, mstart, mend, reporter);
int spindex = mstart;
final IndexRecord rec = new IndexRecord();
final InMemValBytes value = new InMemValBytes();
for (int i = 0; i < partitions; ++i) {
IFile.Writer writer = null;
try {
long segmentStart = out.getPos();
writer = new Writer(job, out, keyClass, valClass, codec,
spilledRecordsCounter);
if (combinerRunner == null) {
// spill directly
DataInputBuffer key = new DataInputBuffer();
while (spindex < mend &&
kvmeta.get(offsetFor(spindex % maxRec) + PARTITION) == i) {
final int kvoff = offsetFor(spindex % maxRec);
key.reset(kvbuffer, kvmeta.get(kvoff + KEYSTART),
(kvmeta.get(kvoff + VALSTART) -
kvmeta.get(kvoff + KEYSTART)));
getVBytesForOffset(kvoff, value);
writer.append(key, value);
++spindex;
}
} else {
int spstart = spindex;
while (spindex < mend &&
kvmeta.get(offsetFor(spindex % maxRec)
+ PARTITION) == i) {
++spindex;
}
// Note: we would like to avoid the combiner if we've fewer
// than some threshold of records for a partition
if (spstart != spindex) {
combineCollector.setWriter(writer);
RawKeyValueIterator kvIter =
new MRResultIterator(spstart, spindex);
combinerRunner.combine(kvIter, combineCollector);
}
}
// close the writer
writer.close();
// record offsets
rec.startOffset = segmentStart;
rec.rawLength = writer.getRawLength();
rec.partLength = writer.getCompressedLength();
spillRec.putIndex(rec, i);
writer = null;
} finally {
if (null != writer) writer.close();
}
}
if (totalIndexCacheMemory >= indexCacheMemoryLimit) {
// create spill index file
Path indexFilename =
mapOutputFile.getSpillIndexFileForWrite(numSpills, partitions
* MAP_OUTPUT_INDEX_RECORD_LENGTH);
spillRec.writeToFile(indexFilename, job);
} else {
indexCacheList.add(spillRec);
totalIndexCacheMemory +=
spillRec.size() * MAP_OUTPUT_INDEX_RECORD_LENGTH;
}
LOG.info("Finished spill " + numSpills);
++numSpills;
} finally {
if (out != null) out.close();
}
}
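The partition index consulted above (the PARTITION field in kvmeta) was assigned when collect() ran the job's Partitioner. The stock HashPartitioner is essentially:
import org.apache.hadoop.mapreduce.Partitioner;

public class HashPartitioner<K, V> extends Partitioner<K, V> {
  /** Use {@link Object#hashCode()} to spread keys across reducers. */
  public int getPartition(K key, V value, int numReduceTasks) {
    // mask the sign bit so the modulo result is never negative
    return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
  }
}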
Running the ReduceTask
The reduce side runs in three phases: copy, sort, and reduce.
The reducer's input is a RawKeyValueIterator, produced by the Shuffle class's run() method:
public RawKeyValueIterator run() throws IOException, InterruptedException {
// Start the map-completion events fetcher thread
final EventFetcher<K,V> eventFetcher =
new EventFetcher<K,V>(reduceId, umbilical, scheduler, this);
eventFetcher.start();
// Start the map-output fetcher threads
final int numFetchers = jobConf.getInt(MRJobConfig.SHUFFLE_PARALLEL_COPIES, 5);
Fetcher<K,V>[] fetchers = new Fetcher[numFetchers];
for (int i=0; i < numFetchers; ++i) {
fetchers[i] = new Fetcher<K,V>(jobConf, reduceId, scheduler, merger,
reporter, metrics, this,
reduceTask.getJobTokenSecret());
fetchers[i].start();
}
// Wait for shuffle to complete successfully
while (!scheduler.waitUntilDone(PROGRESS_FREQUENCY)) {
reporter.progress();
synchronized (this) {
if (throwable != null) {
throw new ShuffleError("error in shuffle in " + throwingThreadName,
throwable);
}
}
}
// Stop the event-fetcher thread
eventFetcher.shutDown();
// Stop the map-output fetcher threads
for (Fetcher<K,V> fetcher : fetchers) {
fetcher.shutDown();
}
fetchers = null;
// stop the scheduler
scheduler.close();
copyPhase.complete(); // copy is already complete
taskStatus.setPhase(TaskStatus.Phase.SORT);
reduceTask.statusUpdate(umbilical);
// Finish the on-going merges ...
...
}
The iterator returned by run() is backed by Merger.MergeQueue; its next() pops the segment with the smallest current key off a priority queue and advances it:
public boolean next() throws IOException {
if (size() == 0)
return false;
if (minSegment != null) {
//minSegment is non-null for all invocations of next except the first
//one. For the first invocation, the priority queue is ready for use
//but for the subsequent invocations, first adjust the queue
adjustPriorityQueue(minSegment);
if (size() == 0) {
minSegment = null;
return false;
}
}
minSegment = top();
if (!minSegment.inMemory()) {
//When we load the value from an inmemory segment, we reset
//the "value" DIB in this class to the inmem segment's byte[].
//When we load the value bytes from disk, we shouldn't use
//the same byte[] since it would corrupt the data in the inmem
//segment. So we maintain an explicit DIB for value bytes
//obtained from disk, and if the current segment is a disk
//segment, we reset the "value" DIB to the byte[] in that (so
//we reuse the disk segment DIB whenever we consider
//a disk segment).
value.reset(diskIFileValue.getData(), diskIFileValue.getLength());
}
long startPos = minSegment.getPosition();
key = minSegment.getKey();
minSegment.getValue(value);
long endPos = minSegment.getPosition();
totalBytesProcessed += endPos - startPos;
mergeProgress.set(totalBytesProcessed * progPerByte);
return true;
}
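With the merged iterator in hand, ReduceTask.runNewReducer() drives the user's Reducer over each key group. For completeness, the canonical word-count Reducer (a sketch; names are illustrative):
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class WordCountReducer
    extends Reducer<Text, IntWritable, Text, IntWritable> {
  private final IntWritable result = new IntWritable();

  @Override
  public void reduce(Text key, Iterable<IntWritable> values, Context context)
      throws IOException, InterruptedException {
    // 'values' streams all counts for one key from the merge iterator above
    int sum = 0;
    for (IntWritable v : values) {
      sum += v.get();
    }
    result.set(sum);
    context.write(key, result);
  }
}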