1. Preparation
Hadoop: 2.8.5
winutils: 2.8.3
① Before developing MapReduce programs you need Hadoop installed, here in a VMware virtual machine; the versions used below should match the version you actually installed.
② To run MapReduce programs on Windows you also need winutils, downloadable from https://github.com/steveloughran/winutils.
③ Extract winutils to a directory, point the HADOOP_HOME environment variable at that directory, and copy hadoop.dll from its bin folder into C:\Windows\System32.
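If setting a system-wide HADOOP_HOME is inconvenient, Hadoop also honors the hadoop.home.dir JVM system property, which must be set before any Hadoop class is loaded. A minimal sketch, assuming winutils was extracted to C:\winutils (a placeholder path; the directory must contain bin\winutils.exe):

    // Put this at the very top of main(), before touching any Hadoop API.
    // "C:\\winutils" is a placeholder; point it at your winutils directory.
    System.setProperty("hadoop.home.dir", "C:\\winutils");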
2. Setting up a Hadoop project in IDEA with Maven
Step 1: Create a new Maven project and name it hadoop-reduce.
Step 2: Add the Maven dependencies. The <properties> block below sits at the top level of the pom and pins hadoop.version to the installed cluster version (2.8.5 here); the <dependency> entries go inside <dependencies>.
<properties>
    <hadoop.version>2.8.5</hadoop.version>
</properties>

<!-- hadoop dependencies -->
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-common</artifactId>
    <version>${hadoop.version}</version>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>${hadoop.version}</version>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-hdfs</artifactId>
    <version>${hadoop.version}</version>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-mapreduce-client-core</artifactId>
    <version>${hadoop.version}</version>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
    <version>${hadoop.version}</version>
</dependency>
<!-- hadoop end -->
Step 3: Add core-site.xml under the resources directory. Note that fs.default.name is deprecated; fs.defaultFS is the current property name:
<configuration>
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://192.168.48.110:9000</value>
        <description>HDFS URI: filesystem://namenode-host:port</description>
    </property>
    <property>
        <name>hadoop.tmp.dir</name>
        <value>/usr/local/hadoop/tmp</value>
        <description>Local Hadoop temporary directory on the namenode</description>
    </property>
</configuration>
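Because core-site.xml is on the classpath, a plain new Configuration() picks it up automatically. A quick way to sanity-check this (a hypothetical throwaway class, not part of the job itself):

    import org.apache.hadoop.conf.Configuration;

    public class ConfCheck {
        public static void main(String[] args) {
            // new Configuration() loads core-site.xml from the classpath by default
            Configuration conf = new Configuration();
            // should print hdfs://192.168.48.110:9000 if the file was found
            System.out.println(conf.get("fs.defaultFS"));
        }
    }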
3. Developing the MapReduce application
package com.msa.hadoop.mapreduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.net.URI;
import java.util.StringTokenizer;

/**
 * WordCount test/debug class.
 */
public class WordCount {

    /**
     * Splits each input line into tokens and emits (token, 1) for each one.
     */
    public static class TokenizerMapper extends
            Mapper<Object, Text, Text, IntWritable> {

        /** Reusable constant 1, emitted once per token. */
        public static final IntWritable one = new IntWritable(1);
        private Text word = new Text();

        @Override
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                this.word.set(itr.nextToken());
                context.write(this.word, one);
            }
        }
    }

    /**
     * Sums the counts for each word; also used as the combiner.
     */
    public static class IntSumReduce extends
            Reducer<Text, IntWritable, Text, IntWritable> {

        private IntWritable result = new IntWritable();

        @Override
        public void reduce(Text key, Iterable<IntWritable> values,
                           Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            this.result.set(sum);
            context.write(key, this.result);
        }
    }

    /**
     * The entry point of application.
     *
     * @param args the input arguments
     * @throws IOException            the io exception
     * @throws ClassNotFoundException the class not found exception
     * @throws InterruptedException   the interrupted exception
     */
    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        if (args.length != 2) {
            System.err.println("Usage: WordCount <in> <out>");
            System.exit(2);
        }
        // Delete the output directory if it already exists; otherwise the job fails.
        FileSystem fileSystem = FileSystem.get(URI.create(args[1]), conf);
        if (fileSystem.exists(new Path(args[1]))) {
            fileSystem.delete(new Path(args[1]), true);
        }
        Job job = Job.getInstance(conf, "WordCount");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(WordCount.TokenizerMapper.class);
        job.setReducerClass(WordCount.IntSumReduce.class);
        job.setCombinerClass(WordCount.IntSumReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
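The job expects the HDFS input and output paths as program arguments. In IDEA, set them in the run configuration, for example (the output path matches the one in the logs below; the input path is an assumed example, point it at the directory holding your input files):

    hdfs://192.168.48.110:9000/usr/file/input hdfs://192.168.48.110:9000/usr/file/output

Note that main() deletes the output path if it already exists, so the job can be rerun without manually cleaning up HDFS first.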
4. Testing and debugging
Run the WordCount class; output like the following appears:
[INFO ] 2018-12-16 12:24:03,596 method:org.apache.hadoop.mapred.Task.done(Task.java:1139)
Final Counters for attempt_local2051717568_0001_m_000000_0: Counters: 23
File System Counters
FILE: Number of bytes read=162
FILE: Number of bytes written=372896
FILE: Number of read operations=0
FILE: Number of large read operations=0
FILE: Number of write operations=0
HDFS: Number of bytes read=42
HDFS: Number of bytes written=0
HDFS: Number of read operations=6
HDFS: Number of large read operations=0
HDFS: Number of write operations=2
Map-Reduce Framework
Map input records=4
Map output records=5
Map output bytes=60
Map output materialized bytes=76
Input split bytes=111
Combine input records=5
Combine output records=5
Spilled Records=5
Failed Shuffles=0
Merged Map outputs=0
GC time elapsed (ms)=0
Total committed heap usage (bytes)=190316544
File Input Format Counters
Bytes Read=42
[INFO ] 2018-12-16 12:24:03,597 method:org.apache.hadoop.mapred.LocalJobRunner$Job$MapTaskRunnable.run(LocalJobRunner.java:276)
Finishing task: attempt_local2051717568_0001_m_000000_0
[INFO ] 2018-12-16 12:24:03,597 method:org.apache.hadoop.mapred.LocalJobRunner$Job.runTasks(LocalJobRunner.java:483)
map task executor complete.
[INFO ] 2018-12-16 12:24:03,604 method:org.apache.hadoop.mapred.LocalJobRunner$Job.runTasks(LocalJobRunner.java:475)
Waiting for reduce tasks
[INFO ] 2018-12-16 12:24:03,605 method:org.apache.hadoop.mapred.LocalJobRunner$Job$ReduceTaskRunnable.run(LocalJobRunner.java:329)
Starting task: attempt_local2051717568_0001_r_000000_0
[INFO ] 2018-12-16 12:24:03,620 method:org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.<init>(FileOutputCommitter.java:123)
File Output Committer Algorithm version is 1
[INFO ] 2018-12-16 12:24:03,621 method:org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.<init>(FileOutputCommitter.java:138)
FileOutputCommitter skip cleanup _temporary folders under output directory:false, ignore cleanup failures: false
[INFO ] 2018-12-16 12:24:03,622 method:org.apache.hadoop.yarn.util.ProcfsBasedProcessTree.isAvailable(ProcfsBasedProcessTree.java:168)
ProcfsBasedProcessTree currently is supported only on Linux.
[INFO ] 2018-12-16 12:24:03,696 method:org.apache.hadoop.mapred.Task.initialize(Task.java:620)
Using ResourceCalculatorProcessTree : org.apache.hadoop.yarn.util.WindowsBasedProcessTree@36e5a131
[INFO ] 2018-12-16 12:24:03,702 method:org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:362)
Using ShuffleConsumerPlugin: org.apache.hadoop.mapreduce.task.reduce.Shuffle@16f5951f
[INFO ] 2018-12-16 12:24:03,754 method:org.apache.hadoop.mapreduce.task.reduce.MergeManagerImpl.<init>(MergeManagerImpl.java:206)
MergerManager: memoryLimit=1153853056, maxSingleShuffleLimit=288463264, mergeThreshold=761543040, ioSortFactor=10, memToMemMergeOutputsThreshold=10
[INFO ] 2018-12-16 12:24:03,758 method:org.apache.hadoop.mapreduce.task.reduce.EventFetcher.run(EventFetcher.java:61)
attempt_local2051717568_0001_r_000000_0 Thread started: EventFetcher for fetching Map Completion Events
[INFO ] 2018-12-16 12:24:03,797 method:org.apache.hadoop.mapreduce.Job.monitorAndPrintJob(Job.java:1411)
Job job_local2051717568_0001 running in uber mode : false
[INFO ] 2018-12-16 12:24:03,800 method:org.apache.hadoop.mapreduce.Job.monitorAndPrintJob(Job.java:1418)
map 100% reduce 0%
[INFO ] 2018-12-16 12:24:03,814 method:org.apache.hadoop.mapreduce.task.reduce.LocalFetcher.copyMapOutput(LocalFetcher.java:145)
localfetcher#1 about to shuffle output of map attempt_local2051717568_0001_m_000000_0 decomp: 72 len: 76 to MEMORY
[INFO ] 2018-12-16 12:24:03,825 method:org.apache.hadoop.mapreduce.task.reduce.InMemoryMapOutput.doShuffle(InMemoryMapOutput.java:93)
Read 72 bytes from map-output for attempt_local2051717568_0001_m_000000_0
[INFO ] 2018-12-16 12:24:03,828 method:org.apache.hadoop.mapreduce.task.reduce.MergeManagerImpl.closeInMemoryFile(MergeManagerImpl.java:321)
closeInMemoryFile -> map-output of size: 72, inMemoryMapOutputs.size() -> 1, commitMemory -> 0, usedMemory ->72
[INFO ] 2018-12-16 12:24:03,831 method:org.apache.hadoop.mapreduce.task.reduce.EventFetcher.run(EventFetcher.java:76)
EventFetcher is interrupted.. Returning
[INFO ] 2018-12-16 12:24:03,832 method:org.apache.hadoop.mapred.LocalJobRunner$Job.statusUpdate(LocalJobRunner.java:618)
1 / 1 copied.
[INFO ] 2018-12-16 12:24:03,833 method:org.apache.hadoop.mapreduce.task.reduce.MergeManagerImpl.finalMerge(MergeManagerImpl.java:693)
finalMerge called with 1 in-memory map-outputs and 0 on-disk map-outputs
[INFO ] 2018-12-16 12:24:03,850 method:org.apache.hadoop.mapred.Merger$MergeQueue.merge(Merger.java:606)
Merging 1 sorted segments
[INFO ] 2018-12-16 12:24:03,851 method:org.apache.hadoop.mapred.Merger$MergeQueue.merge(Merger.java:705)
Down to the last merge-pass, with 1 segments left of total size: 54 bytes
[INFO ] 2018-12-16 12:24:03,855 method:org.apache.hadoop.mapreduce.task.reduce.MergeManagerImpl.finalMerge(MergeManagerImpl.java:760)
Merged 1 segments, 72 bytes to disk to satisfy reduce memory limit
[INFO ] 2018-12-16 12:24:03,857 method:org.apache.hadoop.mapreduce.task.reduce.MergeManagerImpl.finalMerge(MergeManagerImpl.java:790)
Merging 1 files, 76 bytes from disk
[INFO ] 2018-12-16 12:24:03,858 method:org.apache.hadoop.mapreduce.task.reduce.MergeManagerImpl.finalMerge(MergeManagerImpl.java:805)
Merging 0 segments, 0 bytes from memory into reduce
[INFO ] 2018-12-16 12:24:03,859 method:org.apache.hadoop.mapred.Merger$MergeQueue.merge(Merger.java:606)
Merging 1 sorted segments
[INFO ] 2018-12-16 12:24:03,860 method:org.apache.hadoop.mapred.Merger$MergeQueue.merge(Merger.java:705)
Down to the last merge-pass, with 1 segments left of total size: 54 bytes
[INFO ] 2018-12-16 12:24:03,861 method:org.apache.hadoop.mapred.LocalJobRunner$Job.statusUpdate(LocalJobRunner.java:618)
1 / 1 copied.
[INFO ] 2018-12-16 12:24:03,901 method:org.apache.hadoop.conf.Configuration.logDeprecation(Configuration.java:1285)
mapred.skip.on is deprecated. Instead, use mapreduce.job.skiprecords
[INFO ] 2018-12-16 12:24:04,017 method:org.apache.hadoop.mapred.Task.done(Task.java:1105)
Task:attempt_local2051717568_0001_r_000000_0 is done. And is in the process of committing
[INFO ] 2018-12-16 12:24:04,022 method:org.apache.hadoop.mapred.LocalJobRunner$Job.statusUpdate(LocalJobRunner.java:618)
1 / 1 copied.
[INFO ] 2018-12-16 12:24:04,022 method:org.apache.hadoop.mapred.Task.commit(Task.java:1284)
Task attempt_local2051717568_0001_r_000000_0 is allowed to commit now
[INFO ] 2018-12-16 12:24:04,038 method:org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.commitTask(FileOutputCommitter.java:582)
Saved output of task 'attempt_local2051717568_0001_r_000000_0' to hdfs://192.168.48.110:9000/usr/file/output/_temporary/0/task_local2051717568_0001_r_000000
[INFO ] 2018-12-16 12:24:04,040 method:org.apache.hadoop.mapred.LocalJobRunner$Job.statusUpdate(LocalJobRunner.java:618)
reduce > reduce
[INFO ] 2018-12-16 12:24:04,040 method:org.apache.hadoop.mapred.Task.sendDone(Task.java:1243)
Task 'attempt_local2051717568_0001_r_000000_0' done.
[INFO ] 2018-12-16 12:24:04,041 method:org.apache.hadoop.mapred.Task.done(Task.java:1139)
Final Counters for attempt_local2051717568_0001_r_000000_0: Counters: 29
File System Counters
FILE: Number of bytes read=346
FILE: Number of bytes written=372972
FILE: Number of read operations=0
FILE: Number of large read operations=0
FILE: Number of write operations=0
HDFS: Number of bytes read=42
HDFS: Number of bytes written=50
HDFS: Number of read operations=9
HDFS: Number of large read operations=0
HDFS: Number of write operations=4
Map-Reduce Framework
Combine input records=0
Combine output records=0
Reduce input groups=5
Reduce shuffle bytes=76
Reduce input records=5
Reduce output records=5
Spilled Records=5
Shuffled Maps =1
Failed Shuffles=0
Merged Map outputs=1
GC time elapsed (ms)=17
Total committed heap usage (bytes)=210763776
Shuffle Errors
BAD_ID=0
CONNECTION=0
IO_ERROR=0
WRONG_LENGTH=0
WRONG_MAP=0
WRONG_REDUCE=0
File Output Format Counters
Bytes Written=50
[INFO ] 2018-12-16 12:24:04,041 method:org.apache.hadoop.mapred.LocalJobRunner$Job$ReduceTaskRunnable.run(LocalJobRunner.java:352)
Finishing task: attempt_local2051717568_0001_r_000000_0
[INFO ] 2018-12-16 12:24:04,041 method:org.apache.hadoop.mapred.LocalJobRunner$Job.runTasks(LocalJobRunner.java:483)
reduce task executor complete.
[INFO ] 2018-12-16 12:24:04,801 method:org.apache.hadoop.mapreduce.Job.monitorAndPrintJob(Job.java:1418)
map 100% reduce 100%
[INFO ] 2018-12-16 12:24:04,802 method:org.apache.hadoop.mapreduce.Job.monitorAndPrintJob(Job.java:1429)
Job job_local2051717568_0001 completed successfully
[INFO ] 2018-12-16 12:24:04,823 method:org.apache.hadoop.mapreduce.Job.monitorAndPrintJob(Job.java:1436)
Counters: 35
File System Counters
FILE: Number of bytes read=508
FILE: Number of bytes written=745868
FILE: Number of read operations=0
FILE: Number of large read operations=0
FILE: Number of write operations=0
HDFS: Number of bytes read=84
HDFS: Number of bytes written=50
HDFS: Number of read operations=15
HDFS: Number of large read operations=0
HDFS: Number of write operations=6
Map-Reduce Framework
Map input records=4
Map output records=5
Map output bytes=60
Map output materialized bytes=76
Input split bytes=111
Combine input records=5
Combine output records=5
Reduce input groups=5
Reduce shuffle bytes=76
Reduce input records=5
Reduce output records=5
Spilled Records=10
Shuffled Maps =1
Failed Shuffles=0
Merged Map outputs=1
GC time elapsed (ms)=17
Total committed heap usage (bytes)=401080320
Shuffle Errors
BAD_ID=0
CONNECTION=0
IO_ERROR=0
WRONG_LENGTH=0
WRONG_MAP=0
WRONG_REDUCE=0
File Input Format Counters
Bytes Read=42
File Output Format Counters
Bytes Written=50
Process finished with exit code 0
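To verify the result, you can dump the output file back out of HDFS, e.g. with hdfs dfs -cat /usr/file/output/part-r-00000 (part-r-00000 is the default name of the first reducer's output file). The same check in Java, as a minimal sketch reusing the paths from above:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IOUtils;
    import java.net.URI;

    public class ReadOutput {
        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            FileSystem fs = FileSystem.get(URI.create("hdfs://192.168.48.110:9000"), conf);
            // part-r-00000 holds the output of the single reducer in this job
            Path result = new Path("/usr/file/output/part-r-00000");
            // copy the file's bytes to stdout and close the stream when done
            IOUtils.copyBytes(fs.open(result), System.out, 4096, true);
        }
    }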