MapReduce
Overview
I. MapReduce
1. What is MapReduce
- MapReduce is a distributed computing framework.
- It breaks a large data-processing job into individual tasks that run in parallel across a cluster of servers.
- It originated at Google.
- It is suited to large-scale data processing.
- Each node processes the data stored on that node.
- Each job consists of a Map part and a Reduce part.
2. Design ideas behind MapReduce
- Divide and conquer
- A simplified programming model for parallel computation
- Two abstractions: Map and Reduce
  - Developers only need to implement the Mapper and Reducer functions
- System-level details are hidden
  - Developers can focus on the business logic
3. Characteristics of MapReduce
- Strengths
  - Easy to program
  - Scalable
  - Highly fault tolerant
  - High throughput
- Poor fit for
  - Real-time computation
  - Stream processing
II. Implementing WordCount with MapReduce
1. Design approach
2. Execution process
- Data formats
  - map: (K1, V1) → list(K2, V2)
  - reduce: (K2, list(V2)) → list(K3, V3)
- Stages of a MapReduce job (a sketch of wiring in an optional Combiner follows this list)
  - Mapper
  - Combiner
  - Partitioner
  - Shuffle and Sort
  - Reducer
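Because summing counts is associative and commutative, the WordCount reducer can also serve as the Combiner, pre-aggregating counts on the map side and shrinking the shuffle. A minimal sketch of the extra driver line, reusing the WCReducer defined in 3.1:

// Optional: pre-aggregate counts on the map side before the shuffle.
job.setCombinerClass(WCReducer.class);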
3. Implementation
3.1 Write the Java code
- Mapper
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;

/**
 * LongWritable  input key: the byte offset of the line within the file (not a line number)
 * Text          input value: one line of text (a String)
 * Text          output key: a single word (a String)
 * IntWritable   output value: the count emitted for each word, always 1 here
 */
public class WCMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // key: input key, value: input value, context: used to emit output key/value pairs
        String line = value.toString();
        String[] words = line.split(" ");
        for (String word : words) {
            context.write(new Text(word), new IntWritable(1));
        }
    }
}
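One common refinement, shown here only as an optional sketch (the class name WCMapperReuse is made up; it uses the same imports as WCMapper), is to allocate the output objects once and reuse them instead of creating a new Text and IntWritable for every word:

public class WCMapperReuse extends Mapper<LongWritable, Text, Text, IntWritable> {
    // Reused output objects; context.write serializes their contents immediately, so reuse is safe.
    private final Text outKey = new Text();
    private final IntWritable one = new IntWritable(1);

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        for (String word : value.toString().split(" ")) {
            outKey.set(word);
            context.write(outKey, one);
        }
    }
}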
- Reducer
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;

/**
 * Text         input key type: the word (String)
 * IntWritable  input value type: a partial count (int)
 * Text         output key type: the word (String)
 * IntWritable  output value type: the total frequency of the word (int)
 */
public class WCReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // key: the word, values: all counts emitted for this word, context: used to emit the result
        int total = 0;
        for (IntWritable value : values) {
            total += value.get();
        }
        context.write(key, new IntWritable(total));
    }
}
- Partitioner
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

public class WCPartitioner extends Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text text, IntWritable intWritable, int numPartitions) {
        // Route each word to a reducer by hashing it; the result must fall in [0, numPartitions).
        return Math.abs(text.hashCode() % numPartitions);
    }
}
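This partitioner mirrors what Hadoop's default HashPartitioner already does, so a custom class is only needed when a different routing rule is wanted. One caveat with Math.abs: Integer.MIN_VALUE stays negative, so Hadoop's own implementation masks the sign bit instead. A safer body for getPartition would be:

// Same idea as org.apache.hadoop.mapreduce.lib.partition.HashPartitioner:
// masking with Integer.MAX_VALUE clears the sign bit, so the result is always in [0, numPartitions).
return (text.hashCode() & Integer.MAX_VALUE) % numPartitions;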
- Job
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WCDriver {
    public static void main(String[] args) throws Exception {
        // 1. Set up the job
        Configuration cfg = new Configuration();
        Job job = Job.getInstance(cfg, "job_wc");
        job.setJarByClass(WCDriver.class);
        // 2. Specify the mapper and reducer
        job.setMapperClass(WCMapper.class);
        job.setReducerClass(WCReducer.class);
        // Mapper output types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // Partitioner and number of reduce tasks
        job.setNumReduceTasks(4);
        job.setPartitionerClass(WCPartitioner.class);
        // Reducer output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1])); // the output directory must not exist before the run
        // 3. Run the job
        boolean result = job.waitForCompletion(true);
        System.out.println(result ? "success" : "failure");
        System.exit(result ? 0 : 1);
    }
}
3.2 Build the jar and upload it to Linux
Building the jar was covered in detail in an earlier post; follow the link below if needed.
Link: link.
3.3 Run the M/R job
hadoop jar testown.jar cn.kgc.kb09.mr.WCDriver /test/a.txt /test/haha
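Once the job finishes, the output can be inspected in HDFS. The directory below matches the second argument of the command above; since the driver sets four reduce tasks, the results are spread across part-r-00000 through part-r-00003:

hdfs dfs -ls /test/haha
hdfs dfs -cat /test/haha/part-r-00000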
III. Implementing a join with MapReduce
Files needed for this example: link (extraction code: wvy4). This is a reduce-side join: the mapper tags each record with the table it came from and emits the customer id as the key, so the reducer receives a customer record together with all of that customer's orders and can merge them.
1. Writable
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class CustomOrder implements Writable {
    private String customId;
    private String customName;
    private String orderId;
    private String orderStatus;
    private String tableFlag; // "0" marks a customer record, "1" marks an order record

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(customId);
        out.writeUTF(customName);
        out.writeUTF(orderId);
        out.writeUTF(orderStatus);
        out.writeUTF(tableFlag);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.customId = in.readUTF();
        this.customName = in.readUTF();
        this.orderId = in.readUTF();
        this.orderStatus = in.readUTF();
        this.tableFlag = in.readUTF();
    }

    public String getCustomId() {
        return customId;
    }
    public void setCustomId(String customId) {
        this.customId = customId;
    }
    public String getCustomName() {
        return customName;
    }
    public void setCustomName(String customName) {
        this.customName = customName;
    }
    public String getOrderId() {
        return orderId;
    }
    public void setOrderId(String orderId) {
        this.orderId = orderId;
    }
    public String getOrderStatus() {
        return orderStatus;
    }
    public void setOrderStatus(String orderStatus) {
        this.orderStatus = orderStatus;
    }
    public String getTableFlag() {
        return tableFlag;
    }
    public void setTableFlag(String tableFlag) {
        this.tableFlag = tableFlag;
    }

    @Override
    public String toString() {
        return "customId='" + customId + '\'' +
                ", customName='" + customName + '\'' +
                ", orderId='" + orderId + '\'' +
                ", orderStatus='" + orderStatus + '\'';
    }
}
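A Writable only survives the shuffle if write() and readFields() agree field for field. A quick local round-trip check can catch mismatches before running the job; this is only a sketch, and the CustomOrderRoundTrip class and the sample field values are made up for illustration:

import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;

public class CustomOrderRoundTrip {
    public static void main(String[] args) throws Exception {
        CustomOrder original = new CustomOrder();
        original.setCustomId("1");
        original.setCustomName("John·Doe"); // hypothetical sample values
        original.setOrderId("");
        original.setOrderStatus("");
        original.setTableFlag("0");

        // Serialize with write(), then read the same bytes back with readFields().
        DataOutputBuffer out = new DataOutputBuffer();
        original.write(out);

        DataInputBuffer in = new DataInputBuffer();
        in.reset(out.getData(), out.getLength());
        CustomOrder copy = new CustomOrder();
        copy.readFields(in);

        System.out.println(copy); // should print the same customId and customName
    }
}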
2.Mapper
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;

public class COMapperJoin extends Mapper<LongWritable, Text, Text, CustomOrder> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        String[] columns = line.split(",");
        // Every field is wrapped in double quotes; strip them.
        for (int i = 0; i < columns.length; i++) {
            columns[i] = columns[i].split("\"")[1];
        }
        CustomOrder co = new CustomOrder();
        if (columns.length == 4) { // order table
            co.setCustomId(columns[2]);
            co.setCustomName("");
            co.setOrderId(columns[0]);
            co.setOrderStatus(columns[3]);
            co.setTableFlag("1");
        } else if (columns.length == 9) { // customer table
            co.setCustomId(columns[0]);
            co.setCustomName(columns[1] + "·" + columns[2]);
            co.setOrderId("");
            co.setOrderStatus("");
            co.setTableFlag("0");
        }
        // Emit the customer id as the join key.
        context.write(new Text(co.getCustomId()), co);
    }
}
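To make the tagging concrete, here is the mapper's behavior on one hypothetical line from each file; the field values are invented for illustration, and only the column positions follow the code above:

// Hypothetical order line (4 quoted fields: orderId, date, customerId, status):
//   "101","2020-11-01","7","CLOSED"
//   → key = 7, value: orderId=101, orderStatus=CLOSED, customName="", tableFlag=1
// Hypothetical customer line (9 quoted fields, the first three being customerId, first name, last name):
//   "7","Mary","Jones",...
//   → key = 7, value: customName=Mary·Jones, orderId="", orderStatus="", tableFlag=0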
3.Reducer
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;

public class COReducerJoin extends Reducer<Text, CustomOrder, CustomOrder, NullWritable> {
    @Override
    protected void reduce(Text key, Iterable<CustomOrder> values, Context context) throws IOException, InterruptedException {
        StringBuffer orderIds = new StringBuffer();
        StringBuffer statuses = new StringBuffer();
        CustomOrder customOrder = new CustomOrder();
        for (CustomOrder co : values) {
            if (co.getCustomName().equals("")) {
                // Order record: collect its id and status.
                orderIds.append(co.getOrderId() + "|");
                statuses.append(co.getOrderStatus() + "|");
            } else {
                // Customer record: copy the fields out, since Hadoop reuses the value object across iterations.
                customOrder.setCustomId(co.getCustomId());
                customOrder.setCustomName(co.getCustomName());
            }
        }
        // Strip the trailing "|" separator.
        String orderId = "";
        String status = "";
        if (orderIds.length() > 0) {
            orderId = orderIds.substring(0, orderIds.length() - 1);
        }
        if (statuses.length() > 0) {
            status = statuses.substring(0, statuses.length() - 1);
        }
        customOrder.setOrderId(orderId);
        customOrder.setOrderStatus(status);
        context.write(customOrder, NullWritable.get());
    }
}
4.Job
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class CODriver {
    public static void main(String[] args) throws Exception {
        Configuration cfg = new Configuration();
        Job job = Job.getInstance(cfg, "co_job");
        job.setJarByClass(CODriver.class);
        job.setMapperClass(COMapperJoin.class);
        job.setReducerClass(COReducerJoin.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(CustomOrder.class);
        job.setOutputKeyClass(CustomOrder.class);
        job.setOutputValueClass(NullWritable.class);
        // Local-filesystem paths for running inside the IDE; the output directory must not exist beforehand.
        FileInputFormat.setInputPaths(job, new Path("file:///D:/IDEA/Data/newdata/testown/data"));
        FileOutputFormat.setOutputPath(job, new Path("file:///E:/test/coResult1/"));
        boolean result = job.waitForCompletion(true);
        System.out.println(result ? "success" : "failure");
        System.exit(result ? 0 : 1);
    }
}
5. Results