Data skew: a large amount of data funnels into one or a few reducers, leaving most of the other reducers idle.
Data-skew solution 1: redesign the key, using a second job to merge the results.
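The core idea: appending a bounded random suffix turns one hot key into many lighter keys, and stripping the suffix afterwards restores the original key so a second job can merge the partial counts. A minimal standalone sketch of that round trip (the helper names salt/unsalt and the bucket count of 100 are illustrative, not part of the classes below):

public class SaltSketch {
    private static final java.util.Random R = new java.util.Random();

    // Salting: "hello" -> e.g. "hello_42", giving up to 100 distinct keys per word.
    static String salt(String word) {
        return word + "_" + R.nextInt(100);
    }

    // Unsalting: "hello_42" -> "hello", dropping everything from the last '_' onward.
    static String unsalt(String salted) {
        return salted.substring(0, salted.lastIndexOf('_'));
    }

    public static void main(String[] args) {
        String salted = salt("hello");
        System.out.println(salted + " -> " + unsalt(salted));
    }
}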
The following walks through word count as an example:
1. DataLeanMapper1: redesign the key by appending a random-number suffix
package hadoop.lean.key;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;
import java.util.Random;

/**
 * DataLeanMapper1
 * Word-count mapper that salts each key with a random suffix.
 */
public class DataLeanMapper1 extends Mapper<LongWritable, Text, Text, IntWritable> {

    private final Random r = new Random();

    /**
     * Called once per input line.
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        String[] arr = line.split(" ");
        Text keyOut = new Text();
        IntWritable valueOut = new IntWritable(1);
        for (String word : arr) {
            // Redesign the key: append a random suffix in [0, 100)
            // so one hot word is spread across many reducer partitions.
            keyOut.set(word + "_" + r.nextInt(100));
            context.write(keyOut, valueOut);
        }
    }
}
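With this mapper a hot word no longer lands on a single reducer. For an input line such as "hello hello hello" it might emit ("hello_42", 1), ("hello_7", 1), ("hello_42", 1) (suffix values illustrative), spreading the load for "hello" across up to 100 distinct keys and, with them, across the reduce tasks.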
2. DataLeanMapper2: split off the suffix and restore the original word

package hadoop.lean.key;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * DataLeanMapper2
 * Strips the "_NN" suffix added in stage one and re-emits the original word.
 */
public class DataLeanMapper2 extends Mapper<Text, Text, Text, IntWritable> {

    /**
     * Called once per key-value pair produced by stage one,
     * e.g. key = "hello_42", value = "3".
     */
    @Override
    protected void map(Text key, Text value, Context context)
            throws IOException, InterruptedException {
        String word = key.toString();
        // Drop everything from the last '_' onward to recover the original word.
        int index = word.lastIndexOf("_");
        word = word.substring(0, index);
        int count = Integer.parseInt(value.toString());
        context.write(new Text(word), new IntWritable(count));
    }
}
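Note why this mapper's input key is already the salted word: stage one writes its results with the default TextOutputFormat, one "key<TAB>value" line per record (for example "hello_42<TAB>3"), and the driver below sets KeyValueTextInputFormat for the second job, which splits each line at the first tab into key and value.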
3. DataLeanReducer1: sums the counts for each key (shared by both jobs)
package hadoop.lean.key;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * DataLeanReducer1
 * Sums the counts for a key; works for both salted and restored keys.
 */
public class DataLeanReducer1 extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int count = 0;
        for (IntWritable iw : values) {
            count += iw.get();
        }
        context.write(key, new IntWritable(count));
    }
}
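The same reducer serves both jobs: in stage one it sums the 1s emitted per salted key, and in stage two it sums the per-suffix partial counts for each restored word. Only the keys differ between the two passes.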
4. App: the driver; this data-skew fix requires chaining two jobs
package hadoop.lean.key;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * App
 * Fixing data skew by key redesign requires two chained jobs.
 */
public class App {
    public static void main(String[] args) throws Exception {
        // Local test paths: input file, stage-one output dir, final output dir.
        args = new String[]{"d:/java/mr/data/1.txt", "d:/java/mr/out1", "d:/java/mr/out2"};

        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // Remove stale output directories so the jobs can be rerun.
        if (fs.exists(new Path(args[1]))) {
            fs.delete(new Path(args[1]), true);
        }
        if (fs.exists(new Path(args[2]))) {
            fs.delete(new Path(args[2]), true);
        }

        Job job = Job.getInstance(conf);
        job.setJobName("WordCount-1");
        job.setJarByClass(App.class);
        job.setMapperClass(DataLeanMapper1.class);
        job.setReducerClass(DataLeanReducer1.class);

        // Input and output paths for stage one.
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Map and reduce output types.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.setNumReduceTasks(3);

        // Stage one (first job): count the salted keys.
        if (job.waitForCompletion(true)) {
            job = Job.getInstance(conf);
            job.setJobName("WordCount-2");
            job.setJarByClass(App.class);
            job.setMapperClass(DataLeanMapper2.class);
            job.setReducerClass(DataLeanReducer1.class);

            // Stage one's output is stage two's input.
            FileInputFormat.addInputPath(job, new Path(args[1]));
            FileOutputFormat.setOutputPath(job, new Path(args[2]));

            // Stage one wrote "key<TAB>value" lines, so read them back
            // as key-value pairs rather than raw lines.
            job.setInputFormatClass(KeyValueTextInputFormat.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            job.setNumReduceTasks(3);
            job.waitForCompletion(true);
        }
    }
}
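To submit this to a cluster rather than run it locally, drop the hard-coded args line and pass the paths on the command line; the jar name and HDFS paths below are placeholders:

hadoop jar data-lean.jar hadoop.lean.key.App /data/words.txt /out1 /out2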