Requirement: use MapReduce to count how many times 笑 (laugh), 喜 (joy), 哭 (cry), and 怒 (anger) appear across the full text of a 红楼梦 (Dream of the Red Chamber) TXT file, use a Combiner to reduce I/O, and use a Partitioner to split the results into two output files.
Create the MapReduce project through the MapReduce plugin, so the required dependencies are imported automatically.
Driver (main class):
package com.zhiyou100;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
public class MyApp {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        // Use ":" instead of the default tab as the key/value separator in the output files
        conf.set("mapreduce.output.textoutputformat.separator", ":");
        Path inputPath = new Path("hdfs://master:9000/mark/hlm-utf8.txt");
        Path outputPath = new Path("hdfs://master:9000/result/hml02");
        FileSystem fs = FileSystem.newInstance(conf);
        // Delete the output directory if it already exists
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }
        fs.close();
        // A Job object models one complete MapReduce run
        Job job = Job.getInstance(conf, "HLM");
        job.setJarByClass(MyApp.class);
        // Input directory
        FileInputFormat.addInputPath(job, inputPath);
        // Class that parses the input data (optional; TextInputFormat is the default)
        job.setInputFormatClass(TextInputFormat.class);
        // Custom Mapper class
        job.setMapperClass(MyMapper.class);
        // Map output <K2, V2> types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // Partitioner (optional; HashPartitioner is the default)
        job.setPartitionerClass(MyPartition.class);
        // Number of reduce tasks (optional; here it must be 2 to match MyPartition)
        job.setNumReduceTasks(2);
        // Custom Reducer class
        job.setReducerClass(MyReducer.class);
        // Combiner runs on the map side to shrink the data shuffled to the reducers
        job.setCombinerClass(MyCombiner.class);
        // Generic type parameters are erased at compile time, so the output types must be set explicitly
        // Job output <K3, V3> types (can be omitted when they match the map output types <K2, V2>)
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Output directory
        FileOutputFormat.setOutputPath(job, outputPath);
        // Class that formats the output data (optional; TextOutputFormat is the default)
        job.setOutputFormatClass(TextOutputFormat.class);
        // Submit the job to the cluster and poll until it finishes
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
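After exporting the project as a jar, the driver can be submitted to the cluster with something like hadoop jar hlm.jar com.zhiyou100.MyApp (the jar name hlm.jar is just a placeholder for however you package the project); the hard-coded hdfs://master:9000 paths assume the NameNode address used above.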
Mapper:
package com.zhiyou100;
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class MyMapper extends Mapper<Object, Text, Text, IntWritable> {
    static {
        // Static initializer block: logs when the class is first loaded, to show the call order
        System.out.println("my_-mapper");
    }

    private IntWritable num = new IntWritable();
    private Text word = new Text();
    // Running index of the current "sentence" (token) within this map task
    private int no = 0;

    @Override
    public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
        // Split each line into sentences on Chinese punctuation and whitespace
        StringTokenizer st = new StringTokenizer(value.toString(), "《 》 、 ! , 。 ? :; “ ” ‘ ’ ");
        while (st.hasMoreElements()) {
            String text = st.nextElement().toString().trim();
            no += 1;
            // Custom counter: total number of sentences processed
            context.getCounter("ZY", "statement").increment(1);
            if (text.contains("笑")) {
                word.set("笑");
                num.set(no);
                context.write(word, num);
            }
            if (text.contains("喜")) {
                word.set("喜");
                num.set(no);
                context.write(word, num);
            }
            if (text.contains("哭")) {
                word.set("哭");
                num.set(no);
                context.write(word, num);
            }
            if (text.contains("怒")) {
                word.set("怒");
                num.set(no);
                context.write(word, num);
            }
        }
    }
}
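To make the splitting step easier to follow, here is a minimal standalone sketch (not part of the job; the sample line is made up) showing how the delimiter string above breaks a line into sentences and which of them would trigger a map output:

import java.util.StringTokenizer;

public class TokenizeDemo {
    public static void main(String[] args) {
        // Hypothetical sample line; the real input is the novel's text from HDFS
        String line = "宝玉笑道:你又多心了。";
        StringTokenizer st = new StringTokenizer(line, "《 》 、 ! , 。 ? :; “ ” ‘ ’ ");
        while (st.hasMoreElements()) {
            String text = st.nextElement().toString().trim();
            // Prints "宝玉笑道" (contains 笑, so the mapper would emit (笑, no))
            // and then "你又多心了" (no key word, so nothing would be emitted)
            System.out.println(text);
        }
    }
}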
Reducer:
package com.zhiyou100;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    static {
        System.out.println("my_-reducer");
    }

    private IntWritable result = new IntWritable();

    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // The combiner has already turned the map output into partial counts,
        // so summing the values here yields the total count for this key
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }
        result.set(sum);
        context.write(key, result);
    }
}
Combiner:
package com.zhiyou100;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class MyCombiner extends Reducer<Text, IntWritable, Text, IntWritable> {
    static {
        System.out.println("my_-combiner");
    }

    private IntWritable result = new IntWritable();

    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // Count the number of values (one per occurrence) rather than summing them;
        // the value emitted by the mapper (the sentence index) is deliberately ignored
        int sum = 0;
        for (IntWritable val : values) {
            sum += 1;
        }
        result.set(sum);
        context.write(key, result);
    }
}
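Why this reduces I/O, with made-up numbers: if one map task emits (笑, 15), (笑, 42), and (笑, 97), the combiner collapses those three records into the single record (笑, 3) before the shuffle, so the reducer receives one small partial count instead of one record per occurrence and only has to add the partial counts together. Note that this design only produces occurrence counts because the combiner converts sentence indexes into counts; Hadoop treats the combiner as an optional optimization that may run zero or more times, so a combiner whose logic differs from the reducer's, as here, relies on it running exactly once over every map output.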
Partitioner:
package com.zhiyou100;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
public class MyPartition extends Partitioner<Text, IntWritable> {
    static {
        System.out.println("my_-partition");
    }

    @Override
    public int getPartition(Text key, IntWritable value, int numPartitions) {
        // 笑 and 喜 go to reducer 0; 哭 and 怒 go to reducer 1
        if (key.toString().contains("笑") || key.toString().contains("喜")) {
            return 0;
        } else {
            return 1;
        }
    }
}
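Because the driver sets job.setNumReduceTasks(2), partition 0 is written by the first reducer to part-r-00000 (the counts for 笑 and 喜) and partition 1 by the second reducer to part-r-00001 (哭 and 怒), with each line formatted as key:count thanks to the separator configured in the driver. The number of reduce tasks must cover every partition number the partitioner can return (here 0 and 1, hence setNumReduceTasks(2)).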
Output:
The log lines printed by the static initializer block defined in each class also show the order in which the job invokes the Mapper, Combiner, Partitioner, and Reducer:
my_-mapper -> my_-combiner -> my_-partition -> my_-reducer