package com.zhiyou.db23;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
public class DuplicateRemove {
    // Define a Mapper subclass; the per-record processing logic lives in its map() method.
    public static class DuplicateRemoveMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

        private Text outputKey = new Text();
        private NullWritable outputValue = NullWritable.get();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Split the line on "\t"; only records with exactly three fields are kept.
            String[] info = value.toString().split("\t");
            if (info.length == 3) {
                // Emit the first field as the key; the value carries no data.
                outputKey.set(info[0]);
                context.write(outputKey, outputValue);
            }
        }
    }
    // Define a Reducer subclass; the aggregation logic lives in its reduce() method.
    public static class DuplicateRemoveReduce extends Reducer<Text, NullWritable, Text, NullWritable> {

        private NullWritable outputValue = NullWritable.get();

        @Override
        protected void reduce(Text key, Iterable<NullWritable> values, Context context)
                throws IOException, InterruptedException {
            // All identical keys arrive in the same group; writing the key once removes the duplicates.
            context.write(key, outputValue);
        }
    }
    // Build a Job object, configure it (including the Mapper and Reducer classes), then submit it.
    // A job can run several map tasks and one reduce phase; it may run without a reducer
    // (job.setNumReduceTasks(0) -- see the map-only sketch after the class), but never without a mapper.
    // In an MR job the mapper mainly parses and transforms the input records;
    // the reducer mainly aggregates and sorts the grouped data.
    // To define a mapper, first fix its input/output key-value types, read the key/value from the
    // map() parameters, and emit results with context.write().
    // To define a reducer, likewise fix its input/output key-value types, read the key and its group
    // of values from the reduce() parameters, and emit results with context.write().
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Create the job with Job.getInstance.
        Job job = Job.getInstance(conf);
        // Set the jar by the driver class (setJarByClass) and give the job a name (setJobName).
        job.setJarByClass(DuplicateRemove.class);
        job.setJobName("DuplicateRemove");
        // Set the Mapper class.
        job.setMapperClass(DuplicateRemoveMapper.class);
        // Set the Reducer class.
        job.setReducerClass(DuplicateRemoveReduce.class);
        // Set the output key type.
        job.setOutputKeyClass(Text.class);
        // Set the output value type.
        job.setOutputValueClass(NullWritable.class);
        // Set the InputFormat and OutputFormat; if omitted, the text formats are used by default.
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        // Set the input path.
        Path inputpath = new Path(args[0]);
        FileInputFormat.addInputPath(job, inputpath);
        // Set the output path.
        Path outputpath = new Path(args[1]);
        FileOutputFormat.setOutputPath(job, outputpath);
        // Delete the output path first so the job does not fail if it already exists.
        outputpath.getFileSystem(conf).delete(outputpath, true);
        // Submit the job and wait; exit with 0 on success, 1 on failure.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
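As noted in the comments above, a job can also be configured without a reduce phase. Below is a minimal sketch of such a map-only variant (it assumes the same driver pattern and the conf/inputpath/outputpath variables from main above; it is not part of the original code). With zero reduce tasks the mapper output is written straight to the output path, so no grouping or deduplication happens.

        // Map-only variant of the driver: disable the reduce phase entirely.
        Job mapOnlyJob = Job.getInstance(conf);
        mapOnlyJob.setJarByClass(DuplicateRemove.class);
        mapOnlyJob.setMapperClass(DuplicateRemoveMapper.class);
        mapOnlyJob.setNumReduceTasks(0);            // no reducer: map output goes directly to HDFS
        mapOnlyJob.setOutputKeyClass(Text.class);
        mapOnlyJob.setOutputValueClass(NullWritable.class);
        FileInputFormat.addInputPath(mapOnlyJob, inputpath);
        FileOutputFormat.setOutputPath(mapOnlyJob, outputpath);

Once packaged, the regular deduplication job can be submitted with, for example, hadoop jar dedup.jar com.zhiyou.db23.DuplicateRemove <input_dir> <output_dir> (the jar name and paths here are placeholders; the two arguments map to args[0] and args[1] in main).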