1. Virtual machines:
VMware Workstation (install directory: E:\Program Files\vm)
hadoopMaster 192.168.119.129 (if this IP changes, update /etc/hosts accordingly; see the example below)
hadoopSlave 192.168.119.130
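A minimal /etc/hosts sketch for both machines, assuming the addresses listed above (adjust if VMware assigns different IPs):
    192.168.119.129 hadoopMaster
    192.168.119.130 hadoopSlave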
2. WinSCP
Configure WinSCP so it can connect to the master. If the master IP cannot be pinged, disable the host's local network connection and open the virtual machine's network settings.
Fix for VM network connection failures:
In Administrative Tools -> Services, start the VMware DHCP Service and VMware NAT Service services.
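The same two services can also be started from an elevated command prompt (a sketch, assuming the standard VMware service display names):
    net start "VMware DHCP Service"
    net start "VMware NAT Service"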
3. Run workflow
-> Start the hadoopMaster and hadoopSlave virtual machines.
-> Start Hadoop with ./start-all.sh (make sure both machines come up; before starting, check whether Hadoop is already running with ps -ef | grep hadoop, and kill any leftover processes; see the sketch after this list).
-> Prepare the input file, using hadoop/bin/aaaa.txt as a reference, then upload it: ./hadoop fs -put aaaa.txt input puts aaaa.txt into Hadoop's input directory.
-> In the hadoop/bin directory, run ./hadoop jar word.jar wordcount input output
-> View the result: ./hadoop fs -cat output/part-r-00000
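The full sequence as a shell sketch, run from hadoop/bin (the kill line is an assumption; substitute the actual PIDs reported by ps):
    ps -ef | grep hadoop                          # check for leftover Hadoop processes
    kill <pid>                                    # only if a process was found above (hypothetical PID)
    ./start-all.sh                                # start Hadoop on both machines
    ./hadoop fs -put aaaa.txt input               # upload the sample file to HDFS
    ./hadoop jar word.jar wordcount input output  # run the job
    ./hadoop fs -cat output/part-r-00000          # show the job result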
4. Program development
-> Get the hadoop-examples-*.jar and WordCount.java that ship with Hadoop; the source code is under src/examples.
-> Modify the Mapper and Reducer in WordCount.java.
-> Compile the modified WordCount.java and copy the resulting .class files into hadoop-examples-*.jar (see the sketch after this list).
-> Then upload the jar to the Hadoop cluster.
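A compile-and-repack sketch, assuming the Hadoop core jar sits next to WordCount.java (the wildcards are expanded by the shell to the installed version; additional dependency jars may be needed depending on the Hadoop release):
    mkdir classes
    javac -classpath hadoop-core-*.jar -d classes WordCount.java
    jar uf hadoop-examples-*.jar -C classes org/apache/hadoop/examples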
5. Program source code
package org.apache.hadoop.examples;
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
/**
 * Example
 *
 * Turns the file
 * npf||40
 * zhz||20
 * zhz||20
 * zhz||20
 * zhz||20
 * zhz||20
 *
 * into
 * npf 40
 * zhz 100
 */
public class WordCount {

    public static class TokenizerMapper extends
            Mapper<Object, Text, Text, IntWritable> {

        /**
         * Object key       the map input key; can be thought of as the line's offset
         * Text value       one full line of the input file
         * Context context  the job context
         */
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            // the whole line
            String str = value.toString();
            // split on '|', e.g. npf||12 becomes {npf, 12}
            StringTokenizer itr = new StringTokenizer(str, "||");
            // iterate over the tokens
            while (itr.hasMoreTokens()) {
                // first token, e.g. npf
                String mykey = itr.nextToken();
                // second token, e.g. 12
                int myvalue = Integer.parseInt(itr.nextToken());
                // emit the map output key/value pair
                context.write(new Text(mykey), new IntWritable(myvalue));
            }
        }
    }
    public static class IntSumReducer extends
            Reducer<Text, IntWritable, Text, IntWritable> {

        private IntWritable result = new IntWritable();

        /**
         * Text key                      the reduce key, as emitted by the map
         * Iterable<IntWritable> values  all values for the same key
         * Context context               the job context
         */
        public void reduce(Text key, Iterable<IntWritable> values,
                Context context) throws IOException, InterruptedException {
            // running total
            int sum = 0;
            // accumulate all values for this key
            for (IntWritable val : values) {
                sum += val.get();
            }
            // assign the total to the reduce output value
            result.set(sum);
            // emit the reduce key/value pair
            context.write(key, result);
        }
    }
    public static void main(String[] args) throws Exception {
        // load the Hadoop configuration
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args)
                .getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: wordcount <in> <out>");
            System.exit(2);
        }
        // create the job
        Job job = new Job(conf, "word count");
        job.setJarByClass(WordCount.class);
        // Mapper class
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);
        // Reducer class
        job.setReducerClass(IntSumReducer.class);
        // type of the reduce output key
        job.setOutputKeyClass(Text.class);
        // type of the reduce output value
        job.setOutputValueClass(IntWritable.class);
        // input HDFS path
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        // output HDFS path
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        // submit the job and wait for completion
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
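With the sample input above, viewing the result with ./hadoop fs -cat output/part-r-00000 should print something like the following (key and value are separated by a tab in the default output format):
    npf    40
    zhz    100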