A complete Hadoop MapReduce job run can be described as follows:
- The client submits a MapReduce job to the JobTracker;
- The JobTracker schedules the job, generating MapTasks and ReduceTasks;
- TaskTrackers receive the MapTasks and ReduceTasks;
- Each TaskTracker starts a new Child Task JVM for every MapTask and ReduceTask;
- The Child Task JVM runs the MapTask or ReduceTask;
- The Child Task JVM reports progress and status to the JobTracker through its TaskTracker;
- When every task under the JobTracker has succeeded, the job is marked as successful.
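Before diving into the source, here is what the client side looks like: a minimal driver in the Hadoop 2.x new-API style (a sketch; WordCountMapper and WordCountReducer are placeholder classes, sketched later in this article):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCountDriver {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "word count");
    job.setJarByClass(WordCountDriver.class);
    job.setMapperClass(WordCountMapper.class);    // placeholder, sketched below
    job.setCombinerClass(WordCountReducer.class); // optional local aggregation
    job.setReducerClass(WordCountReducer.class);  // placeholder, sketched below
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    // waitForCompletion() wraps Job.submit(), the entry point traced below
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}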
JobClient submits the MapReduce Job
Once JobClient.runJob() is called, the MapReduce elephant starts running.
Drill into Job.submit() and you find the method that does the real work: JobSubmitter.submitJobInternal().
JobStatus submitJobInternal(Job job, Cluster cluster)
    throws ClassNotFoundException, InterruptedException, IOException {
  // check the specification of the job
  checkSpecs(job);
  Path jobStagingArea =
      JobSubmissionFiles.getStagingDir(cluster, job.getConfiguration());
  // configure the command line options correctly ...
  ...
}
Among other things, submitJobInternal() writes the input splits into the staging area. writeNewSplits() asks the job's InputFormat to compute them:
private <T extends InputSplit> int writeNewSplits(JobContext job, Path jobSubmitDir)
    throws IOException, InterruptedException, ClassNotFoundException {
  Configuration conf = job.getConfiguration();
  InputFormat<?, ?> input =
      ReflectionUtils.newInstance(job.getInputFormatClass(), conf);
  // call the InputFormat's getSplits() method to produce the InputSplits
  List<InputSplit> splits = input.getSplits(job);
  T[] array = (T[]) splits.toArray(new InputSplit[splits.size()]);
  // sort the splits into order based on size, so that the biggest go first
  ...
}
On the server side, the JobTracker's offerService() loop has been running since startup; it starts the task scheduler, runs recovery, and spins up the housekeeping threads before opening the inter-tracker server:
public void offerService() throws InterruptedException, IOException {
  // Prepare for recovery. This is done irrespective of the status of restart
  // flag.
  while (true) {
    try {
      recoveryManager.updateRestartCount();
      break;
    } catch (IOException ioe) {
      LOG.warn("Failed to initialize recovery manager. ", ioe);
      // wait for some time
      Thread.sleep(FS_ACCESS_RETRY_PERIOD);
      LOG.warn("Retrying...");
    }
  }
  taskScheduler.start();
  recoveryManager.recover();
  // refresh the node list as the recovery manager might have added
  // disallowed trackers
  refreshHosts();
  startExpireTrackersThread();
  expireLaunchingTaskThread.start();
  if (completedJobStatusStore.isActive()) {
    completedJobsStoreThread = new Thread(completedJobStatusStore,
                                          "completedjobsStore-housekeeper");
    completedJobsStoreThread.start();
  }
  // start the inter-tracker server ...
  ...
}
The submission RPC finally lands in JobTracker.submitJob(), which wraps the job in a JobInProgress and queues it:
private JobStatus submitJob(org.apache.hadoop.mapreduce.JobID jobID,
    int restartCount, UserGroupInformation ugi, String jobSubmitDir,
    boolean recovered, Credentials ts) throws IOException, InterruptedException {
  ...
  // Create the JobInProgress, temporarily unlock the JobTracker since
  // we are about to copy job.xml from HDFS
  JobInProgress job =
      new JobInProgress(this, this.conf, restartCount, jobInfo, ts);
  synchronized (this) {
    ...
    return addJob(jobId, job);
  }
}
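Since getSplits() determines the number of MapTasks (typically one per HDFS block for FileInputFormat), the split count can be previewed from the client. A small sketch, assuming the input path exists and the FileSystem is reachable (the path here is hypothetical):
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class SplitPreview {
  public static void main(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration());
    FileInputFormat.addInputPath(job, new Path("/data/input")); // hypothetical path
    // the same call writeNewSplits() makes: one MapTask per returned split
    List<InputSplit> splits = new TextInputFormat().getSplits(job);
    System.out.println("map tasks = " + splits.size());
  }
}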
The JobInitManager breaks each job down into tasks: it takes newly submitted jobs off the jobInitQueue and hands each one to a thread pool for initialization:
class JobInitManager implements Runnable {
  public void run() {
    JobInProgress job = null;
    while (true) {
      try {
        synchronized (jobInitQueue) {
          while (jobInitQueue.isEmpty()) {
            jobInitQueue.wait();
          }
          job = jobInitQueue.remove(0);
        }
        threadPool.execute(new InitJob(job));
      } catch (InterruptedException t) {
        LOG.info("JobInitManagerThread interrupted.");
        break;
      }
    }
    LOG.info("Shutting down thread pool");
    threadPool.shutdownNow();
  }
}
Initialization calls JobInProgress.initTasks(), which creates the TaskInProgress objects for the map and reduce tasks:
public synchronized void initTasks()
    throws IOException, KillInterruptedException, UnknownHostException {
  ...
  createMapTasks(jobFile.toString(), taskSplitMetaInfo);
  ...
  // set the launch time
  this.launchTime = JobTracker.getClock().getTime();
  createReduceTasks(jobFile.toString());
  ...
}
The JobTracker assigns tasks to TaskTrackers
In its heartbeat() method, the JobTracker calls JobQueueTaskScheduler.assignTasks(TaskTracker taskTracker) and returns the assigned tasks inside the HeartbeatResponse.
public synchronized HeartbeatResponse heartbeat(TaskTrackerStatus status,
boolean restarted,
boolean initialContact,
boolean acceptNewTasks,
short responseId)
throws IOException {
...
// Process this heartbeat
short newResponseId = (short)(responseId + 1);
status.setLastSeen(now);
if (!processHeartbeat(status, initialContact)) {
if (prevHeartbeatResponse != null) {
trackerToHeartbeatResponseMap.remove(trackerName);
}
return new HeartbeatResponse(newResponseId,
new TaskTrackerAction[] {new ReinitTrackerAction()});
}
// Initialize the response to be sent for the heartbeat
HeartbeatResponse response = new HeartbeatResponse(newResponseId, null);
List<TaskTrackerAction> actions = new ArrayList<TaskTrackerAction>();
isBlacklisted = faultyTrackers.isBlacklisted(status.getHost());
// Check for new tasks to be executed
...
}
On the TaskTracker side, offerService() is the main service loop: it sends the heartbeat and processes whatever directives the JobTracker returns.
/**
 * The server retry loop.
 * This while-loop attempts to connect to the JobTracker. It ...
 */
State offerService() throws Exception {
long lastHeartbeat = 0;
while (running && !shuttingDown) {
try {
...
// Send the heartbeat and process the jobtracker's directives
HeartbeatResponse heartbeatResponse = transmitHeartBeat(now);
TaskTrackerAction[] actions = heartbeatResponse.getActions();
...
if (actions != null){
for(TaskTrackerAction action: actions) {
if (action instanceof LaunchTaskAction) {
addToTaskQueue((LaunchTaskAction)action);
} else if (action instanceof CommitTaskAction) {
CommitTaskAction commitAction = (CommitTaskAction)action;
if (!commitResponses.contains(commitAction.getTaskID())) {
LOG.info("Received commit task action for " +
commitAction.getTaskID());
commitResponses.add(commitAction.getTaskID());
}
} else {
tasksToCleanup.put(action);
}
}
}
markUnresponsiveTasks();
killOverflowingTasks();
//we've cleaned up, resume normal operation
if (!acceptNewTasks && isIdle()) {
acceptNewTasks=true;
}
...
}
}
return State.NORMAL;
}
The TaskLauncher thread keeps polling the tasksToLaunch queue; once enough slots are free, it calls launchTask() to start the task:
public void run() {
while (!Thread.interrupted()) {
try {
TaskInProgress tip;
Task task;
synchronized (tasksToLaunch) {
while (tasksToLaunch.isEmpty()) {
tasksToLaunch.wait();
}
//get the TIP
tip = tasksToLaunch.remove(0);
task = tip.getTask();
LOG.info("Trying to launch : " + tip.getTask().getTaskID() +
" which needs " + task.getNumSlotsRequired() + " slots");
}
//wait for free slots to run
synchronized (numFreeSlots) {
boolean canLaunch = true;
while (numFreeSlots.get() < task.getNumSlotsRequired()) {
//Make sure that there is no kill task action for this task!
//We are not locking tip here, because it would reverse the
//locking order!
//Also, Lock for the tip is not required here! because :
// 1. runState of TaskStatus is volatile
// 2. Any notification is not missed because notification is
//    synchronized
...
}
}
//got a free slot, launch the task
...
} catch (InterruptedException e) {
return; // ALL DONE
}
}
}
Meanwhile, the TaskTracker has forked a fresh Child JVM for the task. In Child.main(), the child keeps asking its TaskTracker for work over the TaskUmbilicalProtocol:
try {
while (true) {
taskid = null;
JvmTask myTask = umbilical.getTask(context);
if (myTask.shouldDie()) {
break;
} else {
if (myTask.getTask() == null) {
taskid = null;
if (++idleLoopCount >= SLEEP_LONGER_COUNT) {
//we sleep for a bigger interval when we don't receive
//tasks for a while
Thread.sleep(1500);
} else {
Thread.sleep(500);
}
continue;
}
}
idleLoopCount = 0;
task = myTask.getTask();
...
final Task taskFinal = task;
childUGI.doAs(new PrivilegedExceptionAction<Object>() {
@Override
public Object run() throws Exception {
try {
// use job-specified working directory
FileSystem.get(job).setWorkingDirectory(job.getWorkingDirectory());
taskFinal.run(job, umbilical); // run the task
} finally {
TaskLog.syncLogs(logLocation, taskid, isCleanup);
}
return null;
}
});
...
}
}
}
}
Note that this article traces the New API (org.apache.hadoop.mapreduce) code path.
Running the MapTask
The map phase is the easier one to follow.
In its run() method, MapTask calls runNewMapper():
@SuppressWarnings("unchecked")
private <INKEY,INVALUE,OUTKEY,OUTVALUE>
void runNewMapper(final JobConf job,
final TaskSplitIndex splitIndex,
final TaskUmbilicalProtocol umbilical,
TaskReporter reporter
) throws IOException, ClassNotFoundException,
InterruptedException {
// make a task context so we can get the classes
org.apache.hadoop.mapreduce.TaskAttemptContext taskContext =
new org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl(job,
getTaskID(),
reporter);
// make a mapper
org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE> mapper =
(org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE>)
ReflectionUtils.newInstance(taskContext.getMapperClass(), job);
// make the input format
org.apache.hadoop.mapreduce.InputFormat<INKEY,INVALUE> inputFormat =
(org.apache.hadoop.mapreduce.InputFormat<INKEY,INVALUE>)
ReflectionUtils.newInstance(taskContext.getInputFormatClass(), job);
// rebuild the input split
org.apache.hadoop.mapreduce.InputSplit split = null;
split = getSplitDetails(new Path(splitIndex.getSplitLocation()),
splitIndex.getStartOffset());
org.apache.hadoop.mapreduce.RecordReader<INKEY,INVALUE> input =
new NewTrackingRecordReader<INKEY,INVALUE>
(split, inputFormat, reporter, taskContext);
job.setBoolean(JobContext.SKIP_RECORDS, isSkipping());
org.apache.hadoop.mapreduce.RecordWriter output = null;
// get an output object
if (job.getNumReduceTasks() == 0) {
output =
new NewDirectOutputCollector(taskContext, job, umbilical, reporter);
} else {
output = new NewOutputCollector(taskContext, job, umbilical, reporter);
}
org.apache.hadoop.mapreduce.MapContext<INKEY, INVALUE, OUTKEY, OUTVALUE>
mapContext =
new MapContextImpl<INKEY, INVALUE, OUTKEY, OUTVALUE>(job, getTaskID(),
input, output,
committer,
reporter, split);
org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE>.Context
mapperContext =
new WrappedMapper<INKEY, INVALUE, OUTKEY, OUTVALUE>().getMapContext(
mapContext);
input.initialize(split, mapperContext);
mapper.run(mapperContext);
mapPhase.complete();
setPhase(TaskStatus.Phase.SORT);
statusUpdate(umbilical);
input.close();
output.close(mapperContext);
}
runNewMapper() then drives the user-supplied Mapper class:
/**
 * Expert users can override this method for more complete control over the
 * execution of the Mapper.
 * @param context
 * @throws IOException
 */
public void run(Context context) throws IOException, InterruptedException {
  setup(context);
  while (context.nextKeyValue()) {
    map(context.getCurrentKey(), context.getCurrentValue(), context);
  }
  cleanup(context);
}
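For reference, here is the canonical word-count Mapper that this loop would drive (a sketch; class and field names are illustrative):
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class WordCountMapper
    extends Mapper<LongWritable, Text, Text, IntWritable> {
  private static final IntWritable ONE = new IntWritable(1);
  private final Text word = new Text();

  @Override
  public void map(LongWritable key, Text value, Context context)
      throws IOException, InterruptedException {
    // invoked once per record by the Mapper.run() loop shown above
    StringTokenizer itr = new StringTokenizer(value.toString());
    while (itr.hasMoreTokens()) {
      word.set(itr.nextToken());
      context.write(word, ONE); // collected into the MapOutputBuffer described next
    }
  }
}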
Map output goes to the in-memory MapOutputBuffer
The map output is collected through the context; the default implementation gathers it into a MapOutputBuffer, which is implemented as a circular (ring) buffer.
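As a toy sketch of the wraparound idea only (Hadoop's real kvbuffer/kvmeta bookkeeping is considerably more involved):
// Minimal ring buffer: writes wrap past the end of the array, and a
// "spill" frees space at the read end -- the same idea MapOutputBuffer
// uses so the mapper can keep writing while earlier output is spilled.
class RingBuffer {
  private final byte[] buf;
  private int writePos = 0; // next byte to write
  private int readPos = 0;  // first unspilled byte
  private int used = 0;

  RingBuffer(int capacity) {
    buf = new byte[capacity];
  }

  boolean write(byte[] data) {
    if (used + data.length > buf.length) {
      return false; // buffer full: in Hadoop this triggers sortAndSpill()
    }
    for (byte b : data) {
      buf[writePos] = b;
      writePos = (writePos + 1) % buf.length; // wrap around
    }
    used += data.length;
    return true;
  }

  void spill(int n) { // pretend n bytes were written to disk
    readPos = (readPos + n) % buf.length;
    used -= n;
  }
}
The real entry point is MapOutputBuffer.collect(), whose contract is spelled out in its javadoc: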
/**
 * Serialize the key, value to intermediate storage.
 * When this method returns, kvindex must refer to sufficient unused
 * storage to store one METADATA.
 */
public synchronized void collect(K key, V value, final int partition)
    throws IOException {
  ...
}
Once the buffer passes its spill threshold, a background thread sorts the buffered records by partition and key and writes them out in sortAndSpill():
private void sortAndSpill() throws IOException, ClassNotFoundException,
InterruptedException {
//approximate the length of the output file to be the length of the
//buffer + header lengths for the partitions
final long size = (bufend >= bufstart
? bufend - bufstart
: (bufvoid - bufend) + bufstart) +
partitions * APPROX_HEADER_LENGTH;
FSDataOutputStream out = null;
try {
// create spill file
final SpillRecord spillRec = new SpillRecord(partitions);
final Path filename =
mapOutputFile.getSpillFileForWrite(numSpills, size);
out = rfs.create(filename);
final int mstart = kvend / NMETA;
final int mend = 1 + // kvend is a valid record
(kvstart >= kvend
? kvstart
: kvmeta.capacity() + kvstart) / NMETA;
sorter.sort(MapOutputBuffer.this, mstart, mend, reporter);
int spindex = mstart;
final IndexRecord rec = new IndexRecord();
final InMemValBytes value = new InMemValBytes();
for (int i = 0; i < partitions; ++i) {
IFile.Writer writer = null;
try {
long segmentStart = out.getPos();
writer = new Writer(job, out, keyClass, valClass, codec,
spilledRecordsCounter);
if (combinerRunner == null) {
// spill directly
DataInputBuffer key = new DataInputBuffer();
while (spindex < mend &&
kvmeta.get(offsetFor(spindex % maxRec) + PARTITION) == i) {
final int kvoff = offsetFor(spindex % maxRec);
key.reset(kvbuffer, kvmeta.get(kvoff + KEYSTART),
(kvmeta.get(kvoff + VALSTART) -
kvmeta.get(kvoff + KEYSTART)));
getVBytesForOffset(kvoff, value);
writer.append(key, value);
++spindex;
}
} else {
int spstart = spindex;
while (spindex < mend &&
kvmeta.get(offsetFor(spindex % maxRec)
+ PARTITION) == i) {
++spindex;
}
// Note: we would like to avoid the combiner if we've fewer
// than some threshold of records for a partition
if (spstart != spindex) {
combineCollector.setWriter(writer);
RawKeyValueIterator kvIter =
new MRResultIterator(spstart, spindex);
combinerRunner.combine(kvIter, combineCollector);
}
}
// close the writer
writer.close();
// record offsets
rec.startOffset = segmentStart;
rec.rawLength = writer.getRawLength();
rec.partLength = writer.getCompressedLength();
spillRec.putIndex(rec, i);
writer = null;
} finally {
if (null != writer) writer.close();
}
}
if (totalIndexCacheMemory >= indexCacheMemoryLimit) {
// create spill index file
Path indexFilename =
mapOutputFile.getSpillIndexFileForWrite(numSpills, partitions
* MAP_OUTPUT_INDEX_RECORD_LENGTH);
spillRec.writeToFile(indexFilename, job);
} else {
indexCacheList.add(spillRec);
totalIndexCacheMemory +=
spillRec.size() * MAP_OUTPUT_INDEX_RECORD_LENGTH;
}
LOG.info("Finished spill " + numSpills);
++numSpills;
} finally {
if (out != null) out.close();
}
}
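The partition index consulted above (the PARTITION field in kvmeta) was assigned when collect() ran the job's Partitioner. The stock HashPartitioner is essentially:
import org.apache.hadoop.mapreduce.Partitioner;

public class HashPartitioner<K, V> extends Partitioner<K, V> {
  /** Use {@link Object#hashCode()} to spread keys across reducers. */
  public int getPartition(K key, V value, int numReduceTasks) {
    // mask the sign bit so the modulo result is never negative
    return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
  }
}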
Running the ReduceTask
The reduce side runs in three phases: copy, sort, and reduce.
The reducer's input is a RawKeyValueIterator, produced by the Shuffle class's run() method:
public RawKeyValueIterator run() throws IOException, InterruptedException {
// Start the map-completion events fetcher thread
final EventFetcher<K,V> eventFetcher =
new EventFetcher<K,V>(reduceId, umbilical, scheduler, this);
eventFetcher.start();
// Start the map-output fetcher threads
final int numFetchers = jobConf.getInt(MRJobConfig.SHUFFLE_PARALLEL_COPIES, 5);
Fetcher<K,V>[] fetchers = new Fetcher[numFetchers];
for (int i=0; i < numFetchers; ++i) {
fetchers[i] = new Fetcher<K,V>(jobConf, reduceId, scheduler, merger,
reporter, metrics, this,
reduceTask.getJobTokenSecret());
fetchers[i].start();
}
// Wait for shuffle to complete successfully
while (!scheduler.waitUntilDone(PROGRESS_FREQUENCY)) {
reporter.progress();
synchronized (this) {
if (throwable != null) {
throw new ShuffleError("error in shuffle in " + throwingThreadName,
throwable);
}
}
}
// Stop the event-fetcher thread
eventFetcher.shutDown();
// Stop the map-output fetcher threads
for (Fetcher<K,V> fetcher : fetchers) {
fetcher.shutDown();
}
fetchers = null;
// stop the scheduler
scheduler.close();
copyPhase.complete(); // copy is already complete
taskStatus.setPhase(TaskStatus.Phase.SORT);
reduceTask.statusUpdate(umbilical);
// Finish the on-going merges ...
...
}
The iterator returned by run() is backed by Merger.MergeQueue; its next() pops the segment with the smallest current key off a priority queue and advances it:
public boolean next() throws IOException {
if (size() == 0)
return false;
if (minSegment != null) {
//minSegment is non-null for all invocations of next except the first
//one. For the first invocation, the priority queue is ready for use
//but for the subsequent invocations, first adjust the queue
adjustPriorityQueue(minSegment);
if (size() == 0) {
minSegment = null;
return false;
}
}
minSegment = top();
if (!minSegment.inMemory()) {
//When we load the value from an inmemory segment, we reset
//the "value" DIB in this class to the inmem segment's byte[].
//When we load the value bytes from disk, we shouldn't use
//the same byte[] since it would corrupt the data in the inmem
//segment. So we maintain an explicit DIB for value bytes
//obtained from disk, and if the current segment is a disk
//segment, we reset the "value" DIB to the byte[] in that (so
//we reuse the disk segment DIB whenever we consider
//a disk segment).
value.reset(diskIFileValue.getData(), diskIFileValue.getLength());
}
long startPos = minSegment.getPosition();
key = minSegment.getKey();
minSegment.getValue(value);
long endPos = minSegment.getPosition();
totalBytesProcessed += endPos - startPos;
mergeProgress.set(totalBytesProcessed * progPerByte);
return true;
}
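With the merged iterator in hand, ReduceTask.runNewReducer() drives the user's Reducer over each key group. For completeness, the canonical word-count Reducer (a sketch; names are illustrative):
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class WordCountReducer
    extends Reducer<Text, IntWritable, Text, IntWritable> {
  private final IntWritable result = new IntWritable();

  @Override
  public void reduce(Text key, Iterable<IntWritable> values, Context context)
      throws IOException, InterruptedException {
    // 'values' streams all counts for one key from the merge iterator above
    int sum = 0;
    for (IntWritable v : values) {
      sum += v.get();
    }
    result.set(sum);
    context.write(key, result);
  }
}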