MapReduce Case Study: Mobile Phone Traffic
Serialization Principles
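Hadoop deliberately avoids Java's built-in java.io.Serializable: its Writable mechanism produces a far more compact byte stream, which matters because every record may be shuffled across the network between nodes. A Writable type has three obligations: a no-arg constructor (the framework instantiates it by reflection), a write(DataOutput) method that serializes the fields, and a readFields(DataInput) method that reads them back in exactly the same order. FlowBean below follows this pattern, and a standalone round-trip demo appears right after the class.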
Eclipse shortcuts:
Generate a main method: type main, then Alt+/
Generate a print statement: type sysout, then Alt+/
FlowBean.java
package com.matrix.flowsum;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
/**
 * FlowBean: a custom Writable carrying per-phone traffic counters.<BR>
 * Author: Matrix <BR>
 * Date: 2016-03-10 08:56 <BR>
 *
 * @version 1.0.0
 */
public class FlowBean implements WritableComparable<FlowBean> {
// Why implement WritableComparable?
// Writable supplies Hadoop's serialization hooks (write/readFields), and
// Comparable lets the bean double as a sort key in the SortMR example below
// Phone number
private String phoneNB;
// Upstream traffic
private long up_flow;
// Downstream traffic
private long d_flow;
// Total traffic
private long s_flow;
// No-arg constructor: during deserialization the framework instantiates the
// class via reflection, which requires an explicit no-arg constructor
public FlowBean() {
super();
}
// Convenience constructor for initializing all the fields in one call
public FlowBean(String phoneNB, long up_flow, long d_flow) {
super();
this.phoneNB = phoneNB;
this.up_flow = up_flow;
this.d_flow = d_flow;
this.s_flow = up_flow + d_flow;
}
public String getPhoneNB() {
return phoneNB;
}
public void setPhoneNB(String phoneNB) {
this.phoneNB = phoneNB;
}
public long getUp_flow() {
return up_flow;
}
public void setUp_flow(long up_flow) {
this.up_flow = up_flow;
}
public long getD_flow() {
return d_flow;
}
public void setD_flow(long d_flow) {
this.d_flow = d_flow;
}
public long getS_flow() {
return s_flow;
}
public void setS_flow(long s_flow) {
this.s_flow = s_flow;
}
// Serialize the bean's fields into the data output stream
@Override
public void write(DataOutput out) throws IOException {
out.writeUTF(phoneNB);   // phone number
out.writeLong(up_flow);  // upstream traffic
out.writeLong(d_flow);   // downstream traffic
out.writeLong(s_flow);   // total traffic
}
// Deserialize the fields from the data input stream;
// they must be read back in exactly the order they were written
@Override
public void readFields(DataInput in) throws IOException {
phoneNB = in.readUTF();   // phone number
up_flow = in.readLong();  // upstream traffic
d_flow = in.readLong();   // downstream traffic
s_flow = in.readLong();   // total traffic
}
// TextOutputFormat calls toString() on the value, so emit plain tab-separated
// columns; this lets the output of flowsum serve directly as input for SortMR
@Override
public String toString() {
return up_flow + "\t" + d_flow + "\t" + s_flow;
}
// Comparison rule used when the bean itself is the map output key (SortMR):
// descending order of total traffic
@Override
public int compareTo(FlowBean o) {
return Long.compare(o.s_flow, this.s_flow);
}
}
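To see the serialization contract outside of a running job, here is a minimal round-trip sketch (FlowBeanRoundTrip and the sample values are invented for illustration, not part of the project):
package com.matrix.flowsum;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class FlowBeanRoundTrip {
    public static void main(String[] args) throws IOException {
        FlowBean original = new FlowBean("13726230503", 2481, 24681);

        // Serialize: write() pushes the fields into the stream in a fixed order
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        original.write(new DataOutputStream(buffer));

        // Deserialize: the framework would create the instance through the
        // no-arg constructor, then readFields() must read in the same order
        FlowBean copy = new FlowBean();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(buffer.toByteArray())));

        System.out.println(copy); // prints the same three counters
    }
}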
FlowSumMapper.java
package com.matrix.flowsum;
import java.io.IOException;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
/**
 * Generic types: LongWritable (byte offset of the input line), Text (the line
 * itself), Text (phone number), FlowBean (custom value type).<BR>
 * Author: Matrix <BR>
 * Date: 2016-03-10 08:45 <BR>
 *
 * @version 1.0.0
 */
public class FlowSumMapper extends Mapper<LongWritable, Text, Text, FlowBean> {
// FlowBean is our own data type; to travel between Hadoop nodes it must follow
// Hadoop's serialization mechanism, i.e. implement the Writable interface
// Take one log line, extract the fields we need (phone number, upstream and
// downstream traffic), wrap them into a key/value pair and emit it
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
// Grab one line of input
String line = value.toString();
// Split it on the tab character
String[] field = StringUtils.split(line, "\t");
// Pick out the fields we need
// Phone number
String phoneNB = field[1];
// Upstream traffic
long up_flow = Long.parseLong(field[7]);
// Downstream traffic
long d_flow = Long.parseLong(field[8]);
// Wrap the data into a key/value pair
System.out.println(phoneNB + "-------->" + new FlowBean(phoneNB, up_flow, d_flow).toString());
context.write(new Text(phoneNB), new FlowBean(phoneNB, up_flow, d_flow));
}
}
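For reference, the indices 1, 7 and 8 assume tab-separated log lines roughly shaped like the following (an invented sample; the real log layout must match):
1363157985066  13726230503  00-FD-07-A4-72-B8:CMCC  120.196.100.82  i02.c.aliimg.com  24  27  2481  24681  200
Here field 0 is a timestamp, field 1 the phone number, and fields 7/8 the upstream/downstream byte counts; the mapper ignores everything else.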
FlowSumReducer.java
package com.matrix.flowsum;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class FlowSumReducer extends Reducer<Text, FlowBean, Text, FlowBean> {
// The reduce logic: iterate over the values, accumulate the sums, and emit the result
@Override
protected void reduce(Text key, Iterable<FlowBean> values, Context context)
throws IOException, InterruptedException {
long up_flow_counter = 0;
long d_flow_counter = 0;
// Sum the upstream and downstream traffic for this phone number
for (FlowBean flowBean : values) {
up_flow_counter += flowBean.getUp_flow();
d_flow_counter += flowBean.getD_flow();
}
// Emit the aggregated result
context.write(key, new FlowBean(key.toString(), up_flow_counter, d_flow_counter));
}
}
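A small optimization worth noting: because this reduce is a plain sum (commutative and associative) and its input/output types match the map output types, the same class can also act as a combiner, so partial sums are computed on the map side and less data crosses the network. That would be one optional extra line in the job setup below (not in the original code):
job.setCombinerClass(FlowSumReducer.class);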
FlowSumRunner.java
package com.matrix.flowsum;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Job submission, in the canonical style.<BR>
 * Author: Matrix <BR>
 * Date: 2016-03-10 16:24 <BR>
 *
 * @version 1.0.0
 */
public class FlowSumRunner {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
conf.set("fs.defaultFS", "hdfs://node1:8020");
FileSystem fs = FileSystem.get(//
new URI("hdfs://node1:8020"), //
conf, //
"root"//
);
Job job = Job.getInstance(conf);
// Set the job name
job.setJobName("flowsum");
job.setJarByClass(FlowSumRunner.class);
job.setMapperClass(FlowSumMapper.class);
job.setReducerClass(FlowSumReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(FlowBean.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(FlowBean.class);
// job.setNumReduceTasks(3);
FileInputFormat.setInputPaths(job, new Path("/usr/matrix/input/flowsum.dat"));
// Delete the output directory if it already exists (MapReduce refuses to overwrite it)
Path outdir = new Path("/usr/matrix/output/flowsum");
if (fs.exists(outdir)) {
fs.delete(outdir, true);
}
// Set the output directory; it must not exist when the job starts
FileOutputFormat.setOutputPath(job, outdir);
boolean f = job.waitForCompletion(true);
if (f) {
System.out.println("WordCount程序运行成功!");
}
}
}
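The Javadoc above promises the "canonical" submission style; in Hadoop that conventionally means implementing Tool and launching through ToolRunner, which parses generic options such as -D key=value automatically. A minimal sketch of the same job in that style (FlowSumTool is a hypothetical name, and the input/output paths are taken from the command line):
package com.matrix.flowsum;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class FlowSumTool extends Configured implements Tool {
    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        Job job = Job.getInstance(conf, "flowsum");
        job.setJarByClass(FlowSumTool.class);
        job.setMapperClass(FlowSumMapper.class);
        job.setReducerClass(FlowSumReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FlowBean.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        Path out = new Path(args[1]);
        // Remove a stale output directory so the job can be rerun
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(out)) {
            fs.delete(out, true);
        }
        FileOutputFormat.setOutputPath(job, out);
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new Configuration(), new FlowSumTool(), args));
    }
}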
Run result: (screenshot omitted)
Contents of the processed file: (screenshot omitted)
Custom Sort Implementation
Sort the records of the processed file above by total traffic; the approach is explained below.
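MapReduce always sorts map output keys during the shuffle, so no explicit sorting code is needed: the mapper emits the whole FlowBean as the key (with a NullWritable placeholder value) and lets the framework order the records. This works only because the key class implements WritableComparable, which is exactly why FlowBean above declares compareTo(); with that implementation, records reach the reducer in descending order of total traffic. Each input line here is the output of the flowsum job: phone number, upstream sum, downstream sum and total, separated by tabs.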
SortMR.java
package com.matrix.SortMR;
import java.io.IOException;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import com.matrix.flowsum.FlowBean;
public class SortMR {
public static class SortMapper extends Mapper<LongWritable, Text, FlowBean, NullWritable> {
// Split one line into fields, wrap them into a FlowBean, and emit the bean as the key
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
// Grab one line of input
String line = value.toString();
// Split it into fields
String[] fields = StringUtils.split(line, "\t");
// Phone number
String phoneNB = fields[0];
// Upstream traffic
long up_flow = Long.parseLong(fields[1]);
// Downstream traffic
long d_flow = Long.parseLong(fields[2]);
context.write(new FlowBean(phoneNB, up_flow, d_flow), NullWritable.get());
}
}
public static class SortReducer extends Reducer<FlowBean, NullWritable, Text, FlowBean> {
// The keys arrive already sorted by the shuffle; unpack the phone number and emit it with the bean
@Override
protected void reduce(FlowBean key, Iterable<NullWritable> value, Context context)
throws IOException, InterruptedException {
System.out.println(key);
String phoneNB = key.getPhoneNB();
context.write(new Text(phoneNB), key);
}
}
public static void main(String[] args) {
Configuration conf = new Configuration();
conf.set("fs.defaultFS", "hdfs://node1:8020");
try {
FileSystem fs = FileSystem.get(conf);
Job job = Job.getInstance(conf);
job.setJarByClass(SortMR.class);
job.setJobName("sortMR");
job.setMapperClass(SortMapper.class);
job.setReducerClass(SortReducer.class);
job.setMapOutputKeyClass(FlowBean.class);
job.setMapOutputValueClass(NullWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(FlowBean.class);
// Set the input path
FileInputFormat.setInputPaths(job, "/usr/matrix/input/sort");
// Set the output path
Path outer = new Path("/usr/matrix/output/sortMR");
// Delete the output path if it already exists
if (fs.exists(outer)) {
fs.delete(outer, true);
}
FileOutputFormat.setOutputPath(job, outer);
boolean f = job.waitForCompletion(true);
if (f) {
System.out.println("程序运行成功!");
}
} catch (Exception e) {
e.printStackTrace();
}
}
}
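One caveat worth knowing: each reducer writes its own sorted part file, so the output is globally sorted only while a single reducer is used (the default). If the job were scaled to several reducers, either pin it back to one with an optional line such as
job.setNumReduceTasks(1);
or use a total-order partitioner so the part files concatenate into one sorted whole.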
Run result: (screenshot omitted)
Processed output file: (screenshot omitted)