import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class NewVersionWordCount {

    public static class MapClass extends Mapper<LongWritable, Text, Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        @Override
        protected void setup(Context context) {
            // Called once per map task, before the first map() call.
            System.out.println("mapper setup:" + context.getJobName());
        }

        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException { // map function
            // Split the input line on whitespace and emit (word, 1) for each token.
            String line = value.toString();
            StringTokenizer tokenizer = new StringTokenizer(line);
            while (tokenizer.hasMoreTokens()) {
                word.set(tokenizer.nextToken());
                context.write(word, one);
            }
        }

        @Override
        protected void cleanup(Context context) {
            // Called once per map task, after the last map() call.
            System.out.println("mapper cleanup:" + context.getUser());
        }
    }

    public static class ReduceClass extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void setup(Context context) {
            System.out.println("reduce setup:" + context.getUser());
        }

        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException { // reduce function
            // Sum all the counts emitted for this word.
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            context.write(key, new IntWritable(sum));
        }

        @Override
        protected void cleanup(Context context) {
            System.out.println("reduce cleanup:" + context.getUser());
        }
    }

    public static void main(String[] args) throws Exception {
        String inputPath = "/user/User/input";
        String outPath = "/user/User/output";
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf); // replaces the deprecated new Job(conf)
        job.setJobName("wordCount");
        job.setJarByClass(NewVersionWordCount.class);
        job.setMapperClass(MapClass.class);
        job.setReducerClass(ReduceClass.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        /* Optional: delete an existing output directory so the job does not fail on resubmission.
        FileSystem fileSystem = FileSystem.get(conf);
        if (fileSystem.exists(new Path(outPath))) {
            fileSystem.delete(new Path(outPath), true);
        }*/
        FileInputFormat.addInputPath(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, new Path(outPath));
        boolean complete = job.waitForCompletion(true);
        if (!complete) {
            throw new RuntimeException("word count job failed");
        }
    }
}
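Because the reduce logic is a simple commutative and associative sum, the same ReduceClass can also be registered as a combiner to shrink the data shuffled between map and reduce tasks. A minimal sketch of the extra driver line, assuming the job object from the main method above:

// Run ReduceClass on each mapper's local output before the shuffle.
job.setCombinerClass(ReduceClass.class);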
This post presents a new-API version of the WordCount program implemented with Hadoop. The program uses the MapReduce framework to process large volumes of text and count how often each word appears. It walks through the Mapper and Reducer implementations and shows how to set the job parameters and configure the input and output paths.
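Instead of hard-coding the input and output paths, the driver can also be written against Hadoop's Tool/ToolRunner interface so that generic options (for example -D property=value) are parsed automatically and the paths are taken from the command line. The sketch below is not part of the original listing; the class name WordCountDriver and the argument handling are illustrative only, and it reuses the MapClass and ReduceClass defined above.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

// Hypothetical ToolRunner-based driver (not from the original post).
public class WordCountDriver extends Configured implements Tool {
    @Override
    public int run(String[] args) throws Exception {
        Job job = Job.getInstance(getConf(), "wordCount");
        job.setJarByClass(WordCountDriver.class);
        job.setMapperClass(NewVersionWordCount.MapClass.class);
        job.setCombinerClass(NewVersionWordCount.ReduceClass.class);
        job.setReducerClass(NewVersionWordCount.ReduceClass.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));   // input path from the command line
        FileOutputFormat.setOutputPath(job, new Path(args[1])); // output path from the command line
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new Configuration(), new WordCountDriver(), args));
    }
}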