[Hadoop] Replicated Joins with DistributedCache

This article looks closely at how DistributedCache works in the Hadoop MapReduce framework and how it is applied in data processing. It explains in detail how to perform a replicated join with DistributedCache and analyzes its advantages in reducing network bandwidth consumption and improving processing efficiency.

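The mapper (MapClass.java):
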
package com.yc.test3;

import java.io.IOException;
import java.util.Scanner;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.filecache.DistributedCache;

public class MapClass extends Mapper<LongWritable, Text, Text, IntWritable> {
    /**
     * @author 张志刚 2015-09-02
     */
    private Path[] localFiles = null;

    @Override
    public void map(LongWritable ikey, Text ivalue, Context context) throws IOException, InterruptedException {
        System.out.println("map() invoked");
        // Plain word count: split the line on spaces and emit (word, 1) pairs.
        String val = ivalue.toString();
        String[] str = val.split(" ");
        for (String s : str) {
            context.write(new Text(s), new IntWritable(1));
        }
    }
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        Configuration conf = context.getConfiguration();
        // Fetch the local copies of the files shipped via DistributedCache.
        // setup() already declares IOException, so there is no need to swallow it here.
        localFiles = DistributedCache.getLocalCacheFiles(conf);
        if (localFiles == null || localFiles.length == 0) {
            throw new IOException("No files found in the DistributedCache");
        }
        System.out.println("Local cache file path:  " + localFiles[0].toString());
        // The cached copy lives on the local file system of the task node,
        // so it is opened through the local FileSystem rather than HDFS.
        FileSystem fsopen = FileSystem.getLocal(conf);
        FSDataInputStream in = fsopen.open(localFiles[0]);
        Scanner scan = new Scanner(in);
        while (scan.hasNext()) {
            System.out.println(Thread.currentThread().getName() + " scanned:  " + scan.next());
        }
        scan.close();
    }

}
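
The driver (Drive2.java):
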
package com.yc.test3;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Drive2 {
    /**
     * @author 张志刚 2015-09-02
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path in = new Path("hdfs://localhost:9000/user/input/README.txt");
        Path out = new Path("hdfs://localhost:9000/user/output/test1");
        Job job = Job.getInstance(conf, "JobName");
        // Register the file to be copied to every task node, before the job is submitted.
        DistributedCache.addCacheFile(in.toUri(), job.getConfiguration());
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(out)) {
            fs.delete(out, true);
            System.out.println("Output path already exists; deleted.");
        }
        job.setJarByClass(com.yc.test3.Drive2.class);
        job.setMapperClass(MapClass.class);
        job.setReducerClass(Reduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Input and output paths; the output path must be a directory.
        FileInputFormat.setInputPaths(job, in);
        FileOutputFormat.setOutputPath(job, out);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

}
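
The reducer (Reduce.java):
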
package com.yc.test3;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
    /**
     * @author 张志刚 2015-09-02
     */
    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        // Sum the counts emitted for this word by the map tasks.
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }
        context.write(key, new IntWritable(sum));
    }

}
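
Console output from a local run of the job:
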
 WARN [main] - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Output path already exists; deleted.
 INFO [main] - session.id is deprecated. Instead, use dfs.metrics.session-id
 INFO [main] - Initializing JVM Metrics with processName=JobTracker, sessionId=
 WARN [main] - Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
 WARN [main] - No job jar file set.  User classes may not be found. See Job or Job#setJar(String).
 INFO [main] - Total input paths to process : 1
 INFO [main] - number of splits:1
 INFO [main] - Submitting tokens for job: job_local159353495_0001
 INFO [main] - Creating symlink: /usr/local/application/hadoop-2.6.0/tmp/mapred/local/1441131372887/README.txt <- /home/a/workspace/Hadoop2/README.txt
 INFO [main] - Localized hdfs://localhost:9000/user/input/README.txt as file:/usr/local/application/hadoop-2.6.0/tmp/mapred/local/1441131372887/README.txt
 INFO [main] - The url to track the job: http://localhost:8080/
 INFO [main] - Running job: job_local159353495_0001
 INFO [Thread-12] - OutputCommitter set in config null
 INFO [Thread-12] - OutputCommitter is org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
 INFO [Thread-12] - Waiting for map tasks
 INFO [LocalJobRunner Map Task Executor #0] - Starting task: attempt_local159353495_0001_m_000000_0
 INFO [LocalJobRunner Map Task Executor #0] -  Using ResourceCalculatorProcessTree : [ ]
 INFO [LocalJobRunner Map Task Executor #0] - Processing split: hdfs://localhost:9000/user/input/README.txt:0+1366
 INFO [main] - Job job_local159353495_0001 running in uber mode : false
 INFO [LocalJobRunner Map Task Executor #0] - (EQUATOR) 0 kvi 26214396(104857584)
 INFO [LocalJobRunner Map Task Executor #0] - mapreduce.task.io.sort.mb: 100
 INFO [LocalJobRunner Map Task Executor #0] - soft limit at 83886080
 INFO [LocalJobRunner Map Task Executor #0] - bufstart = 0; bufvoid = 104857600
 INFO [LocalJobRunner Map Task Executor #0] - kvstart = 26214396; length = 6553600
 INFO [main] -  map 0% reduce 0%
 INFO [LocalJobRunner Map Task Executor #0] - Map output collector class = org.apache.hadoop.mapred.MapTask$MapOutputBuffer
Local cache file path:  file:/usr/local/application/hadoop-2.6.0/tmp/mapred/local/1441131372887/README.txt
LocalJobRunner Map Task Executor #0 scanned:  For
LocalJobRunner Map Task Executor #0 scanned:  the
LocalJobRunner Map Task Executor #0 scanned:  latest
LocalJobRunner Map Task Executor #0 scanned:  information
LocalJobRunner Map Task Executor #0 scanned:  about
LocalJobRunner Map Task Executor #0 scanned:  Hadoop,
... (one "scanned" line per remaining word of README.txt; 173 more lines omitted)
map() invoked
... (printed once per call; map() runs once for each of the 31 input records, 30 more identical lines omitted)
 INFO [LocalJobRunner Map Task Executor #0] - 
 INFO [LocalJobRunner Map Task Executor #0] - Starting flush of map output
 INFO [LocalJobRunner Map Task Executor #0] - Spilling map output
 INFO [LocalJobRunner Map Task Executor #0] - bufstart = 0; bufend = 2145; bufvoid = 104857600
 INFO [LocalJobRunner Map Task Executor #0] - kvstart = 26214396(104857584); kvend = 26213612(104854448); length = 785/6553600
 INFO [LocalJobRunner Map Task Executor #0] - Finished spill 0
 INFO [LocalJobRunner Map Task Executor #0] - Task:attempt_local159353495_0001_m_000000_0 is done. And is in the process of committing
 INFO [LocalJobRunner Map Task Executor #0] - map
 INFO [LocalJobRunner Map Task Executor #0] - Task 'attempt_local159353495_0001_m_000000_0' done.
 INFO [LocalJobRunner Map Task Executor #0] - Finishing task: attempt_local159353495_0001_m_000000_0
 INFO [Thread-12] - map task executor complete.
 INFO [Thread-12] - Waiting for reduce tasks
 INFO [pool-6-thread-1] - Starting task: attempt_local159353495_0001_r_000000_0
 INFO [pool-6-thread-1] -  Using ResourceCalculatorProcessTree : [ ]
 INFO [pool-6-thread-1] - Using ShuffleConsumerPlugin: org.apache.hadoop.mapreduce.task.reduce.Shuffle@363cef1d
 INFO [pool-6-thread-1] - MergerManager: memoryLimit=503893184, maxSingleShuffleLimit=125973296, mergeThreshold=332569504, ioSortFactor=10, memToMemMergeOutputsThreshold=10
 INFO [EventFetcher for fetching Map Completion Events] - attempt_local159353495_0001_r_000000_0 Thread started: EventFetcher for fetching Map Completion Events
 INFO [localfetcher#1] - localfetcher#1 about to shuffle output of map attempt_local159353495_0001_m_000000_0 decomp: 2541 len: 2545 to MEMORY
 INFO [localfetcher#1] - Read 2541 bytes from map-output for attempt_local159353495_0001_m_000000_0
 INFO [localfetcher#1] - closeInMemoryFile -> map-output of size: 2541, inMemoryMapOutputs.size() -> 1, commitMemory -> 0, usedMemory ->2541
 INFO [EventFetcher for fetching Map Completion Events] - EventFetcher is interrupted.. Returning
 INFO [pool-6-thread-1] - 1 / 1 copied.
 INFO [pool-6-thread-1] - finalMerge called with 1 in-memory map-outputs and 0 on-disk map-outputs
 INFO [pool-6-thread-1] - Merging 1 sorted segments
 INFO [pool-6-thread-1] - Down to the last merge-pass, with 1 segments left of total size: 2538 bytes
 INFO [pool-6-thread-1] - Merged 1 segments, 2541 bytes to disk to satisfy reduce memory limit
 INFO [pool-6-thread-1] - Merging 1 files, 2545 bytes from disk
 INFO [pool-6-thread-1] - Merging 0 segments, 0 bytes from memory into reduce
 INFO [pool-6-thread-1] - Merging 1 sorted segments
 INFO [pool-6-thread-1] - Down to the last merge-pass, with 1 segments left of total size: 2538 bytes
 INFO [pool-6-thread-1] - 1 / 1 copied.
 INFO [pool-6-thread-1] - mapred.skip.on is deprecated. Instead, use mapreduce.job.skiprecords
 INFO [main] -  map 100% reduce 0%
 INFO [pool-6-thread-1] - Task:attempt_local159353495_0001_r_000000_0 is done. And is in the process of committing
 INFO [pool-6-thread-1] - 1 / 1 copied.
 INFO [pool-6-thread-1] - Task attempt_local159353495_0001_r_000000_0 is allowed to commit now
 INFO [pool-6-thread-1] - Saved output of task 'attempt_local159353495_0001_r_000000_0' to hdfs://localhost:9000/user/output/test1/_temporary/0/task_local159353495_0001_r_000000
 INFO [pool-6-thread-1] - reduce > reduce
 INFO [pool-6-thread-1] - Task 'attempt_local159353495_0001_r_000000_0' done.
 INFO [pool-6-thread-1] - Finishing task: attempt_local159353495_0001_r_000000_0
 INFO [Thread-12] - reduce task executor complete.
 INFO [main] -  map 100% reduce 100%
 INFO [main] - Job job_local159353495_0001 completed successfully
 INFO [main] - Counters: 38
    File System Counters
        FILE: Number of bytes read=8166
        FILE: Number of bytes written=535401
        FILE: Number of read operations=0
        FILE: Number of large read operations=0
        FILE: Number of write operations=0
        HDFS: Number of bytes read=5464
        HDFS: Number of bytes written=1310
        HDFS: Number of read operations=37
        HDFS: Number of large read operations=0
        HDFS: Number of write operations=6
    Map-Reduce Framework
        Map input records=31
        Map output records=197
        Map output bytes=2145
        Map output materialized bytes=2545
        Input split bytes=108
        Combine input records=0
        Combine output records=0
        Reduce input groups=132
        Reduce shuffle bytes=2545
        Reduce input records=197
        Reduce output records=132
        Spilled Records=394
        Shuffled Maps =1
        Failed Shuffles=0
        Merged Map outputs=1
        GC time elapsed (ms)=81
        CPU time spent (ms)=0
        Physical memory (bytes) snapshot=0
        Virtual memory (bytes) snapshot=0
        Total committed heap usage (bytes)=249561088
    Shuffle Errors
        BAD_ID=0
        CONNECTION=0
        IO_ERROR=0
        WRONG_LENGTH=0
        WRONG_MAP=0
        WRONG_REDUCE=0
    File Input Format Counters 
        Bytes Read=1366
    File Output Format Counters 
        Bytes Written=1310

What is a replicated join with DistributedCache, and how does it work?

A replicated join works by having DistributedCache copy the smaller of the two data sets to every node.
In the driver we call DistributedCache.addCacheFile() to register the file to be distributed; then, in the mapper's setup() initialization method, we fetch the local copy with DistributedCache.getLocalCacheFiles() and load it into memory.
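
A side note: on Hadoop 2.x both of these DistributedCache calls are deprecated. The replacement wiring goes through Job and the task Context instead; a minimal sketch against the code above (not something the original code uses) looks like this. It also relies on the fact that each cached file is symlinked into the task's working directory under its base name, as the "Creating symlink: ... <- .../README.txt" line in the log shows.

// Driver side: replaces DistributedCache.addCacheFile(in.toUri(), job.getConfiguration()).
job.addCacheFile(in.toUri());

// Mapper side, e.g. in setup(): the registered URIs are available on the context...
java.net.URI[] cacheFiles = context.getCacheFiles();
// ...and thanks to the working-directory symlink the file can be opened by name:
java.io.BufferedReader reader =
        new java.io.BufferedReader(new java.io.FileReader("README.txt"));

Functionally nothing changes; DistributedCache still does the copying underneath.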

Why use it?
The previous section covered the reduce-side join: the mapper wraps and tags each record before emitting it, the intermediate results are shuffled across the network to the reducers, and only there does the combine() step filter out and discard most of the data. The reduce-side join therefore wastes precious network bandwidth and is inefficient. The replicated join based on DistributedCache was introduced precisely to avoid this.
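
The word-count code above only demonstrates the caching mechanics; it does not actually join anything. As a minimal sketch of the join itself, assuming the cached side file and the big input both hold tab-separated key/value lines (the class name, field names, and file format here are illustrative, not from the original post):

package com.yc.test3;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class ReplicatedJoinMapper extends Mapper<LongWritable, Text, Text, Text> {
    // The small table, replicated to every node by DistributedCache and
    // loaded into memory once per map task.
    private final Map<String, String> smallTable = new HashMap<String, String>();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        Path[] cached = DistributedCache.getLocalCacheFiles(context.getConfiguration());
        if (cached == null || cached.length == 0) {
            throw new IOException("Expected the small side file in the DistributedCache");
        }
        // The cached copy is an ordinary file on the task node's local disk.
        BufferedReader reader = new BufferedReader(new FileReader(cached[0].toUri().getPath()));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                String[] parts = line.split("\t", 2);      // assumed format: key<TAB>value
                if (parts.length == 2) {
                    smallTable.put(parts[0], parts[1]);
                }
            }
        } finally {
            reader.close();
        }
    }

    @Override
    public void map(LongWritable ikey, Text ivalue, Context context)
            throws IOException, InterruptedException {
        String[] parts = ivalue.toString().split("\t", 2); // big-table record: key<TAB>value
        if (parts.length < 2) {
            return;
        }
        // Probe the in-memory small table: the join happens here, with no shuffle at all.
        String matched = smallTable.get(parts[0]);
        if (matched != null) {                             // inner join: drop non-matching rows
            context.write(new Text(parts[0]), new Text(parts[1] + "\t" + matched));
        }
    }
}

Because the join completes inside map(), the job can run map-only (job.setNumReduceTasks(0)), which is exactly where the replicated join saves the shuffle bandwidth a reduce-side join spends.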

You can store the scanned results in an instance field (a collection) and pull data from that collection whenever it is needed: setup() runs once per map task, while map() runs once per input record, as Hadoop's Mapper.run() method shows:

public void run(Context context) throws IOException, InterruptedException {
    setup(context);
    try {
        while (context.nextKeyValue()) {
            map(context.getCurrentKey(), context.getCurrentValue(), context);
        }
    } finally {
        cleanup(context);
    }
}
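
For example (an illustrative variant of the MapClass above, not part of the original post): load the scanned words into a HashSet field once in setup(), then consult that set on every map() call to count only the words that also appear in the cached file.

package com.yc.test3;

import java.io.IOException;
import java.util.HashSet;
import java.util.Scanner;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class FilteringMapClass extends Mapper<LongWritable, Text, Text, IntWritable> {
    // Filled once in setup(); read by every subsequent map() call.
    private final Set<String> cachedWords = new HashSet<String>();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        Configuration conf = context.getConfiguration();
        Path[] localFiles = DistributedCache.getLocalCacheFiles(conf);
        if (localFiles == null || localFiles.length == 0) {
            throw new IOException("No files found in the DistributedCache");
        }
        Scanner scan = new Scanner(FileSystem.getLocal(conf).open(localFiles[0]));
        while (scan.hasNext()) {
            cachedWords.add(scan.next());  // store the words instead of printing them
        }
        scan.close();
    }

    @Override
    public void map(LongWritable ikey, Text ivalue, Context context)
            throws IOException, InterruptedException {
        for (String s : ivalue.toString().split(" ")) {
            if (cachedWords.contains(s)) { // only count words present in the cached file
                context.write(new Text(s), new IntWritable(1));
            }
        }
    }
}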