1.缺失值处理
如果一行数据中为空的字段超过三个,则剔除该行数据,并在作业结束时输出被剔除的记录数。
程序代码:
package org.mapreduce;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class qqsz1 {
public static void main(String[]args)throws IOException,InterruptedException,ClassNotFoundException{
Job job= Job.getInstance();
job.setJobName("去点点");
job.setJarByClass(qqsz1.class);
job.setMapperClass(domap.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
Path in=new Path("输入文件地址");
Path out=new Path("输出地址");
FileInputFormat.addInputPath(job,in);
FileOutputFormat.setOutputPath(job,out);
System.exit(job.waitForCompletion(true)?0:1);
}
public static class domap extends Mapper<Object, Text,Text,Text>{
public void map(Object key,Text value,Context context)throws IOException,InterruptedException{
if(key.toString().equals("0")){
return;
}
String [] line=value.toString().split("\t");
for(String aa:line ){
int count=0;
if(aa.trim().equals("")){
count++;
}
if(count>1){
context.getCounter("剔除数据","条数").increment(1);
return;
}
}
context.write(value,new Text(""));
}
}
}
2.数据去重
程序代码:
package org.mapreduce;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class qcfz {
public static void main(String[]args)throws IOException,InterruptedException,ClassNotFoundException {
Job job= Job.getInstance();
job.setJarByClass(qcfz.class);
job.setJobName("去重");
job.setMapperClass(domap.class);
job.setReducerClass(doreduce.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
Path in=new Path("输入文件地址");
Path out=new Path("输出结果地址");
FileInputFormat.addInputPath(job,in);
FileOutputFormat.setOutputPath(job,out);
System.exit(job.waitForCompletion(true)?0:1);
}
public static class domap extends Mapper<Object, Text,Text,Text>{
public void map(Object key,Text value,Context context)throws IOException,InterruptedException{
Text line=value;
context.write(line,new Text(""));
}
}
public static class doreduce extends Reducer<Text,Text,Text,Text>{
public void reduce(Text key,Iterable<Text> value,Context context)throws IOException,InterruptedException{
context.write(key,new Text(""));
}
}
}
本文介绍了如何使用 Java MapReduce 处理数据。首先讲解了如何通过 MapReduce 程序剔除空字段超过三个的行,以此进行缺失值处理,并给出了相应代码。接着,文章讨论了数据去重的 MapReduce 实现,同样提供了相应的程序代码,帮助理解 MapReduce 在数据处理中的应用。
1353

被折叠的 条评论
为什么被折叠?



