Data skew: a large amount of data funnels into one or a few reducers, leaving most of the other reducers idle.
Data-skew solution 1: redesign the key, using a second job to merge the results.
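The core idea: appending a bounded random suffix turns one hot key into many lighter keys, and stripping the suffix afterwards restores the original key so a second job can merge the partial counts. A minimal standalone sketch of that round trip (the helper names salt/unsalt and the bucket count of 100 are illustrative, not part of the classes below):

public class SaltSketch {
    private static final java.util.Random R = new java.util.Random();

    // Salting: "hello" -> e.g. "hello_42", giving up to 100 distinct keys per word.
    static String salt(String word) {
        return word + "_" + R.nextInt(100);
    }

    // Unsalting: "hello_42" -> "hello", dropping everything from the last '_' onward.
    static String unsalt(String salted) {
        return salted.substring(0, salted.lastIndexOf('_'));
    }

    public static void main(String[] args) {
        String salted = salt("hello");
        System.out.println(salted + " -> " + unsalt(salted));
    }
}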
The following walks through word count as an example:
1. DataLeanMapper1: redesign the key by appending a random-number suffix
package hadoop.lean.key;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;
import java.util.Random;

/**
 * DataLeanMapper1
 * Word-count mapper that salts each key with a random suffix.
 */
public class DataLeanMapper1 extends Mapper<LongWritable, Text, Text, IntWritable> {

    private final Random r = new Random();

    /**
     * Called once per input line.
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        String[] arr = line.split(" ");
        Text keyOut = new Text();
        IntWritable valueOut = new IntWritable(1);
        for (String word : arr) {
            // Redesign the key: append a random suffix in [0, 100)
            // so one hot word is spread across many reducer partitions.
            keyOut.set(word + "_" + r.nextInt(100));
            context.write(keyOut, valueOut);
        }
    }
}
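With this mapper a hot word no longer lands on a single reducer. For an input line such as "hello hello hello" it might emit ("hello_42", 1), ("hello_7", 1), ("hello_42", 1) (suffix values illustrative), spreading the load for "hello" across up to 100 distinct keys and, with them, across the reduce tasks.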
2. DataLeanMapper2: split off the suffix and restore the original word

package hadoop.lean.key;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * DataLeanMapper2
 * Strips the "_NN" suffix added in stage one and re-emits the original word.
 */
public class DataLeanMapper2 extends Mapper<Text, Text, Text, IntWritable> {

    /**
     * Called once per key-value pair produced by stage one,
     * e.g. key = "hello_42", value = "3".
     */
    @Override
    protected void map(Text key, Text value, Context context)
            throws IOException, InterruptedException {
        String word = key.toString();
        // Drop everything from the last '_' onward to recover the original word.
        int index = word.lastIndexOf("_");
        word = word.substring(0, index);
        int count = Integer.parseInt(value.toString());
        context.write(new Text(word), new IntWritable(count));
    }
}
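Note why this mapper's input key is already the salted word: stage one writes its results with the default TextOutputFormat, one "key<TAB>value" line per record (for example "hello_42<TAB>3"), and the driver below sets KeyValueTextInputFormat for the second job, which splits each line at the first tab into key and value.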
3. DataLeanReducer1: sums the counts for each key (shared by both jobs)
package hadoop.lean.key;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * DataLeanReducer1
 * Sums the counts for a key; works for both salted and restored keys.
 */
public class DataLeanReducer1 extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int count = 0;
        for (IntWritable iw : values) {
            count += iw.get();
        }
        context.write(key, new IntWritable(count));
    }
}
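The same reducer serves both jobs: in stage one it sums the 1s emitted per salted key, and in stage two it sums the per-suffix partial counts for each restored word. Only the keys differ between the two passes.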
4. App: the driver; this data-skew fix requires chaining two jobs
package hadoop.lean.key;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * App
 * Fixing data skew by key redesign requires two chained jobs.
 */
public class App {
    public static void main(String[] args) throws Exception {
        // Local test paths: input file, stage-one output dir, final output dir.
        args = new String[]{"d:/java/mr/data/1.txt", "d:/java/mr/out1", "d:/java/mr/out2"};

        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // Remove stale output directories so the jobs can be rerun.
        if (fs.exists(new Path(args[1]))) {
            fs.delete(new Path(args[1]), true);
        }
        if (fs.exists(new Path(args[2]))) {
            fs.delete(new Path(args[2]), true);
        }

        Job job = Job.getInstance(conf);
        job.setJobName("WordCount-1");
        job.setJarByClass(App.class);
        job.setMapperClass(DataLeanMapper1.class);
        job.setReducerClass(DataLeanReducer1.class);

        // Input and output paths for stage one.
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Map and reduce output types.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.setNumReduceTasks(3);

        // Stage one (first job): count the salted keys.
        if (job.waitForCompletion(true)) {
            job = Job.getInstance(conf);
            job.setJobName("WordCount-2");
            job.setJarByClass(App.class);
            job.setMapperClass(DataLeanMapper2.class);
            job.setReducerClass(DataLeanReducer1.class);

            // Stage one's output is stage two's input.
            FileInputFormat.addInputPath(job, new Path(args[1]));
            FileOutputFormat.setOutputPath(job, new Path(args[2]));

            // Stage one wrote "key<TAB>value" lines, so read them back
            // as key-value pairs rather than raw lines.
            job.setInputFormatClass(KeyValueTextInputFormat.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            job.setNumReduceTasks(3);
            job.waitForCompletion(true);
        }
    }
}
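To submit this to a cluster rather than run it locally, drop the hard-coded args line and pass the paths on the command line; the jar name and HDFS paths below are placeholders:

hadoop jar data-lean.jar hadoop.lean.key.App /data/words.txt /out1 /out2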