package com.mzsx.hadoop;
import java.io.IOException;
import java.util.Random;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class MySortWordCount {
public static class MyMapper extends
Mapper<Object, Text, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);// 类似于int类型
private Text word = new Text(); // 可以理解成String类型
public void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
System.err.println(key + "," + value);
// 默认情况下即根据空格分隔字符串
String tmp=value.toString();
tmp=tmp.replace('\'', ' ');
tmp=tmp.replace('.', ' ');
tmp=tmp.replace(',', ' ');
tmp=tmp.replace(':', ' ');
tmp=tmp.replace('!', ' ');
tmp=tmp.replace(';', ' ');
tmp=tmp.replace('?', ' ');
tmp=tmp.replace('`', ' ');
tmp=tmp.replace('"', ' ');
tmp=tmp.replace('&', ' ');
tmp=tmp.replace('(', ' ');
tmp=tmp.replace(')', ' ');
tmp=tmp.replace('-', ' ');
StringTokenizer itr = new StringTokenizer(tmp);
while (itr.hasMoreTokens()) {
word.set(itr.nextToken());
context.write(word, one);
}
};
}
// Reducer<KEYIN, VALUEIN, KEYOUT, VALUEOUT>
public static class MyReducer extends
Reducer<Text, IntWritable, Text, IntWritable> {
private IntWritable result = new IntWritable();
protected void reduce(Text key, Iterable<IntWritable> values,
Context context) throws IOException, InterruptedException {
System.err.println(key + "," + values);
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
result.set(sum);
;
context.write(key, result);// 这是最后结果
};
}
public static class SortMapper extends Mapper<Object, Text, IntWritable,Text>{
public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
IntWritable times = new IntWritable(1);
Text password = new Text();
String eachline=value.toString();
String[] eachterm =eachline.split("\t");
password.set(eachterm[0]);
times.set(Integer.parseInt(eachterm[1]));
context.write(times,password);
}
}
public static class SortReducer extends Reducer<IntWritable,Text,IntWritable,Text> {
private Text password = new Text();
public void reduce(IntWritable key,Iterable<Text> values, Context context) throws IOException, InterruptedException {
for (Text val : values) {
password.set(val);
context.write(key,password);
}
}
}
private static class IntDecreasingComparator extends IntWritable.Comparator {
public int compare(WritableComparable a, WritableComparable b) {
//return -super.compare(a, b);
return super.compare(a, b);
}
public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
//return -super.compare(b1, s1, l1, b2, s2, l2);
return super.compare(b1, s1, l1, b2, s2, l2);
}
}
/**
 * Runs the two-stage pipeline: a word-count job whose output goes to a
 * temporary directory, followed by a sort job that reads that directory and
 * writes {@code count<TAB>word} records ordered by the count key.
 *
 * <p>The process exits with status 0 when both jobs succeed and 1 otherwise.
 *
 * @param args optional paths: {@code args[0]} is the input path (default
 *             {@code /user/root/aoman.txt}) and {@code args[1]} is the final
 *             output directory (default {@code /user/root/sort1})
 * @throws Exception if job configuration, submission or monitoring fails
 */
public static void main(String[] args) throws Exception {
    final Path inputPath = new Path(args.length > 0 ? args[0] : "/user/root/aoman.txt");
    final Path sortedOutput = new Path(args.length > 1 ? args[1] : "/user/root/sort1");
    // Intermediate directory: the word-count job writes here and the sort job
    // reads from here. The random suffix keeps concurrent runs apart.
    final Path tempDir = new Path("MySortWordCount-temp-"
            + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    Configuration conf = new Configuration();

    // ---- Stage 1: word count -------------------------------------------
    Job job = new Job(conf, "Word Count");
    job.setJarByClass(MySortWordCount.class);
    job.setMapperClass(MyMapper.class);
    // Summation is associative, so the reducer doubles as the combiner.
    job.setCombinerClass(MyReducer.class);
    job.setReducerClass(MyReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, tempDir);

    final boolean countSucceeded = job.waitForCompletion(true);

    // Register the cleanup only after stage 1 has run (so the directory can
    // exist) and regardless of its outcome, so a failed run does not leave the
    // intermediate directory behind.
    FileSystem.get(conf).deleteOnExit(tempDir);

    if (!countSucceeded) {
        // The job has already finished; report the failure instead of waiting
        // on it a second time.
        System.exit(1);
    }

    // ---- Stage 2: sort by count ----------------------------------------
    Job sortJob = new Job(conf, "csdnsort");
    sortJob.setJarByClass(MySortWordCount.class);
    sortJob.setMapperClass(SortMapper.class);
    sortJob.setReducerClass(SortReducer.class);
    // A single reducer yields one output file that is ordered across all keys;
    // with several reducers each part file would only be sorted internally.
    sortJob.setNumReduceTasks(1);
    sortJob.setOutputKeyClass(IntWritable.class);
    sortJob.setOutputValueClass(Text.class);
    sortJob.setSortComparatorClass(IntDecreasingComparator.class);
    FileInputFormat.addInputPath(sortJob, tempDir);
    FileOutputFormat.setOutputPath(sortJob, sortedOutput);

    System.exit(sortJob.waitForCompletion(true) ? 0 : 1);
}
}
// Reposted from: https://blog.51cto.com/qiangmzsx/1404661